GPU: Add explicit API to sync storage buffer back to host

This PR introduces GPU_storagebuf_sync_to_host as an explicit routine
to flush GPU-resident storage buffer memory back to the host within
the GPU command stream.

The previous implementation relied on implicit synchronization of
resources using OpenGL barriers, which does not match the paradigm of
explicit APIs, where individual resources may need to be tracked.

This patch ensures GPU_storagebuf_read can be called without
stalling the GPU pipeline while work finishes executing. There are
two possible use cases:

1) If GPU_storagebuf_read is called AFTER an explicit call to
GPU_storagebuf_sync_to_host, the read will be synchronized.
If the dependent work is still executing on the GPU, the host
will stall until GPU work has completed and results are available.

2) If GPU_storagebuf_read is called WITHOUT an explicit call to
GPU_storagebuf_sync_to_host, the read will be asynchronous
and whatever memory is visible to the host at that time will be used.
(This is the same as assuming a sync event has already been signalled.)
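
As an illustration, a minimal usage sketch of both cases (not part of
the patch; the Result struct, usage flag and buffer name are
placeholders, and the creation/free calls assume the existing
GPU_storagebuf_create_ex / GPU_storagebuf_free API):

  #include "GPU_storage_buffer.h"

  struct Result {
    float value[4];
  };

  GPUStorageBuf *ssbo = GPU_storagebuf_create_ex(
      sizeof(Result), nullptr, GPU_USAGE_DYNAMIC, "ResultBuf");

  /* ... bind ssbo and dispatch the compute work that writes it ... */

  /* Use case 1: encode the flush in the GPU command stream, then read.
   * The read only stalls until the dependent GPU work has completed. */
  GPU_storagebuf_sync_to_host(ssbo);
  /* ... other CPU-side work ... */
  Result result;
  GPU_storagebuf_read(ssbo, &result);

  /* Use case 2: read without a prior sync. This returns whatever
   * contents are currently visible to the host, as if a sync event had
   * already been signalled. */
  GPU_storagebuf_read(ssbo, &result);

  GPU_storagebuf_free(ssbo);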

This patch also addresses a gap in the Metal implementation, which
lacked read support for GPU-only storage buffers. The read routine now
copies results through a staging buffer when no host-visible buffer is
available.

Reading from a GPU-only storage buffer will always stall the host,
since results cannot be pre-flushed when no host-resident buffer is
available.

Authored by Apple: Michael Parkin-White

Pull Request: https://projects.blender.org/blender/blender/pulls/113456
Jason Fielder 2023-10-20 17:04:36 +02:00 committed by Clément Foucault
parent 7b97bc48d8
commit 1b0ddfa6cb
11 changed files with 148 additions and 9 deletions

View File

@ -1328,6 +1328,8 @@ void ShadowModule::set_view(View &view)
shadow_multi_view_.compute_procedural_bounds();
statistics_buf_.current().async_flush_to_host();
/* Isolate shadow update into own command buffer.
* If parameter buffer exceeds limits, then other work will not be impacted. */
bool use_flush = (shadow_technique == ShadowTechnique::TILE_COPY) &&

View File

@ -248,6 +248,11 @@ class StorageCommon : public DataBuffer<T, len, false>, NonMovable, NonCopyable
GPU_storagebuf_clear_to_zero(ssbo_);
}
void async_flush_to_host()
{
GPU_storagebuf_sync_to_host(ssbo_);
}
void read()
{
GPU_storagebuf_read(ssbo_, this->data_);

View File

@ -48,10 +48,24 @@ void GPU_storagebuf_clear_to_zero(GPUStorageBuf *ssbo);
*/
void GPU_storagebuf_clear(GPUStorageBuf *ssbo, uint32_t clear_value);
/**
* Explicitly sync updated storage buffer contents back to host within the GPU command stream. This
* ensures any changes made by the GPU are visible to the host.
* NOTE: This command is only valid for host-visible storage buffers.
*/
void GPU_storagebuf_sync_to_host(GPUStorageBuf *ssbo);
/**
* Read back content of the buffer to CPU for inspection.
* Slow! Only use for inspection / debugging.
* NOTE: Not synchronized. Use appropriate barrier before reading.
*
* NOTE: If GPU_storagebuf_sync_to_host is called, this command is synchronized against that call.
* If pending GPU updates to the storage buffer are not yet visible to the host, the command will
* stall until dependent GPU work has completed.
*
* Otherwise, this command is unsynchronized and will return current visible storage buffer
* contents immediately.
* Alternatively, use appropriate barrier or GPU_finish before reading.
*/
void GPU_storagebuf_read(GPUStorageBuf *ssbo, void *data);
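
A hedged usage sketch of the note above (ssbo and host_data are
placeholders; GPU_memory_barrier and GPU_BARRIER_BUFFER_UPDATE are the
existing calls from GPU_state.h, as used by the GL/Vulkan backends
further below):

  /* Unsynchronized read, made safe manually with a barrier (or GPU_finish). */
  GPU_memory_barrier(GPU_BARRIER_BUFFER_UPDATE);
  GPU_storagebuf_read(ssbo, host_data);

  /* Or encode the flush up front with the new API and read later. */
  GPU_storagebuf_sync_to_host(ssbo);
  /* ... other CPU-side work ... */
  GPU_storagebuf_read(ssbo, host_data);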

View File

@ -106,6 +106,11 @@ void GPU_storagebuf_copy_sub_from_vertbuf(
unwrap(ssbo)->copy_sub(unwrap(src), dst_offset, src_offset, copy_size);
}
void GPU_storagebuf_sync_to_host(GPUStorageBuf *ssbo)
{
unwrap(ssbo)->async_flush_to_host();
}
void GPU_storagebuf_read(GPUStorageBuf *ssbo, void *data)
{
unwrap(ssbo)->read(data);

View File

@ -47,6 +47,7 @@ class StorageBuf {
virtual void clear(uint32_t clear_value) = 0;
virtual void copy_sub(VertBuf *src, uint dst_offset, uint src_offset, uint copy_size) = 0;
virtual void read(void *data) = 0;
virtual void async_flush_to_host() = 0;
};
/* Syntactic sugar. */

View File

@ -54,6 +54,10 @@ class MTLStorageBuf : public StorageBuf {
/** Usage type. */
GPUUsageType usage_;
/* Synchronization event for host reads. */
id<MTLSharedEvent> gpu_write_fence_ = nil;
uint64_t host_read_signal_value_ = 0;
public:
MTLStorageBuf(size_t size, GPUUsageType usage, const char *name);
~MTLStorageBuf();
@ -68,6 +72,7 @@ class MTLStorageBuf : public StorageBuf {
void clear(uint32_t clear_value) override;
void copy_sub(VertBuf *src, uint dst_offset, uint src_offset, uint copy_size) override;
void read(void *data) override;
void async_flush_to_host() override;
void init();

View File

@ -8,6 +8,7 @@
#include "BLI_string.h"
#include "GPU_state.h"
#include "gpu_backend.hh"
#include "gpu_context_private.hh"
@ -19,6 +20,8 @@
#include "mtl_uniform_buffer.hh"
#include "mtl_vertex_buffer.hh"
#include "PIL_time.h"
namespace blender::gpu {
/* -------------------------------------------------------------------- */
@ -71,6 +74,11 @@ MTLStorageBuf::~MTLStorageBuf()
has_data_ = false;
}
if (gpu_write_fence_ != nil) {
[gpu_write_fence_ release];
gpu_write_fence_ = nil;
}
/* Ensure SSBO is not bound to active CTX.
* SSBO bindings are reset upon Context-switch so we do not need
* to check deactivated context's. */
@ -172,6 +180,7 @@ void MTLStorageBuf::update(const void *data)
toBuffer:dst_buf
destinationOffset:0
size:size_in_bytes_];
staging_buf->free();
}
else {
/* Upload data. */
@ -323,6 +332,40 @@ void MTLStorageBuf::copy_sub(VertBuf *src_, uint dst_offset, uint src_offset, ui
size:copy_size];
}
void MTLStorageBuf::async_flush_to_host()
{
bool device_only = (usage_ == GPU_USAGE_DEVICE_ONLY);
BLI_assert_msg(!device_only,
"Storage buffers with usage GPU_USAGE_DEVICE_ONLY cannot have their data "
"synchronized to the host.");
if (device_only) {
return;
}
MTLContext *ctx = MTLContext::get();
BLI_assert(ctx);
if (gpu_write_fence_ == nil) {
gpu_write_fence_ = [ctx->device newSharedEvent];
}
if (metal_buffer_ == nullptr) {
this->init();
}
/* For discrete memory systems, explicitly flush GPU-resident memory back to host. */
id<MTLBuffer> storage_buf_mtl = this->metal_buffer_->get_metal_buffer();
if (storage_buf_mtl.storageMode == MTLStorageModeManaged) {
id<MTLBlitCommandEncoder> blit_encoder = ctx->main_command_buffer.ensure_begin_blit_encoder();
[blit_encoder synchronizeResource:storage_buf_mtl];
}
/* Encode event signal and flush command buffer to ensure GPU work is in the pipeline for future
* reads. */
ctx->main_command_buffer.encode_signal_event(gpu_write_fence_, ++host_read_signal_value_);
GPU_flush();
}
void MTLStorageBuf::read(void *data)
{
if (data == nullptr) {
@ -333,19 +376,71 @@ void MTLStorageBuf::read(void *data)
this->init();
}
/* Managed buffers need to be explicitly flushed back to host. */
if (metal_buffer_->get_resource_options() & MTLResourceStorageModeManaged) {
/* Device-only storage buffers cannot be read directly and require staging. This path should only
be used for unit testing. */
bool device_only = (usage_ == GPU_USAGE_DEVICE_ONLY);
if (device_only) {
/** Read storage buffer contents via staging buffer. */
/* Fetch active context. */
MTLContext *ctx = static_cast<MTLContext *>(unwrap(GPU_context_active_get()));
BLI_assert(ctx);
/* Ensure GPU updates are flushed back to CPU. */
id<MTLBlitCommandEncoder> blit_encoder = ctx->main_command_buffer.ensure_begin_blit_encoder();
[blit_encoder synchronizeResource:metal_buffer_->get_metal_buffer()];
}
/* Prepare staging buffer. */
gpu::MTLBuffer *staging_buf = MTLContext::get_global_memory_manager()->allocate(size_in_bytes_,
true);
id<MTLBuffer> staging_buf_mtl = staging_buf->get_metal_buffer();
BLI_assert(staging_buf_mtl != nil);
/* Read data. NOTE: Unless explicitly synchronized with GPU work, results may not be ready. */
memcpy(data, metal_buffer_->get_host_ptr(), size_in_bytes_);
/* Ensure destination buffer. */
id<MTLBuffer> storage_buf_mtl = this->metal_buffer_->get_metal_buffer();
BLI_assert(storage_buf_mtl != nil);
id<MTLBlitCommandEncoder> blit_encoder = ctx->main_command_buffer.ensure_begin_blit_encoder();
[blit_encoder copyFromBuffer:storage_buf_mtl
sourceOffset:0
toBuffer:staging_buf_mtl
destinationOffset:0
size:size_in_bytes_];
if (staging_buf_mtl.storageMode == MTLStorageModeManaged) {
[blit_encoder synchronizeResource:staging_buf_mtl];
}
/* Device-only reads will always stall the GPU pipe. */
GPU_finish();
MTL_LOG_WARNING(
"Device-only storage buffer being read. This will stall the GPU pipeline. Ensure this "
"path is only used in testing.");
/* Read contents back to data. */
memcpy(data, staging_buf->get_host_ptr(), size_in_bytes_);
staging_buf->free();
}
else {
/** Direct storage buffer read. */
/* If we have a synchronization event from a prior memory sync, ensure memory is fully synced.
* Otherwise, assume read is asynchronous. */
if (gpu_write_fence_ != nil) {
/* Ensure the GPU updates are visible to the host before reading. */
while (gpu_write_fence_.signaledValue < host_read_signal_value_) {
PIL_sleep_ms(1);
}
}
/* Managed buffers need to be explicitly flushed back to host. */
if (metal_buffer_->get_resource_options() & MTLResourceStorageModeManaged) {
/* Fetch active context. */
MTLContext *ctx = static_cast<MTLContext *>(unwrap(GPU_context_active_get()));
BLI_assert(ctx);
/* Ensure GPU updates are flushed back to CPU. */
id<MTLBlitCommandEncoder> blit_encoder =
ctx->main_command_buffer.ensure_begin_blit_encoder();
[blit_encoder synchronizeResource:metal_buffer_->get_metal_buffer()];
}
/* Read data. NOTE: Unless explicitly synchronized with GPU work, results may not be ready. */
memcpy(data, metal_buffer_->get_host_ptr(), size_in_bytes_);
}
}
id<MTLBuffer> MTLStorageBuf::get_metal_buffer()

View File

@ -161,6 +161,11 @@ void GLStorageBuf::copy_sub(VertBuf *src_, uint dst_offset, uint src_offset, uin
}
}
void GLStorageBuf::async_flush_to_host()
{
GPU_memory_barrier(GPU_BARRIER_BUFFER_UPDATE);
}
void GLStorageBuf::read(void *data)
{
if (ssbo_id_ == 0) {

View File

@ -37,6 +37,7 @@ class GLStorageBuf : public StorageBuf {
void clear(uint32_t clear_value) override;
void copy_sub(VertBuf *src, uint dst_offset, uint src_offset, uint copy_size) override;
void read(void *data) override;
void async_flush_to_host() override;
/* Special internal function to bind SSBOs to indirect argument targets. */
void bind_as(GLenum target);

View File

@ -91,6 +91,11 @@ void VKStorageBuffer::copy_sub(VertBuf *src, uint dst_offset, uint src_offset, u
command_buffer.submit();
}
void VKStorageBuffer::async_flush_to_host()
{
GPU_memory_barrier(GPU_BARRIER_BUFFER_UPDATE);
}
void VKStorageBuffer::read(void *data)
{
ensure_allocated();

View File

@ -33,6 +33,7 @@ class VKStorageBuffer : public StorageBuf, public VKBindableResource {
void clear(uint32_t clear_value) override;
void copy_sub(VertBuf *src, uint dst_offset, uint src_offset, uint copy_size) override;
void read(void *data) override;
void async_flush_to_host() override;
VkBuffer vk_handle() const
{