Vulkan: Timeline Semaphores

This change adds timeline semaphores to track submissions. The previous implementation used a fence. Timeline semaphores can be tracked in more detail because they are counters. For each submission the counter value can be stored locally; when waiting for completion, the stored value can be compared with the semaphore to check whether it has been reached or succeeded by a higher value. The timeline semaphore is stored next to the queue and can also be used to synchronize between multiple contexts. Pull Request: https://projects.blender.org/blender/blender/pulls/115357
2023-11-24 15:23:46 +01:00 · 2023-11-24 15:23:46 +01:00 · ec772ed2f1
parent 18f7d158fe
commit ec772ed2f1
9 changed files with 264 additions and 46 deletions
--- a/intern/ghost/intern/GHOST_ContextVK.cc
+++ b/intern/ghost/intern/GHOST_ContextVK.cc
@ -247,11 +247,13 @@ class GHOST_DeviceVK {

    void *device_create_info_p_next = nullptr;

-    /* Enable optional vulkan 12 features when supported on physical device. */
+    /* Enable optional Vulkan 1.2 features when supported on physical device.
+     * Support level for timelineSemaphore is 99%+; it is requested unconditionally. */
    VkPhysicalDeviceVulkan12Features vulkan_12_features = {};
    vulkan_12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
    vulkan_12_features.shaderOutputLayer = features_12.shaderOutputLayer;
    vulkan_12_features.shaderOutputViewportIndex = features_12.shaderOutputViewportIndex;
+    vulkan_12_features.timelineSemaphore = VK_TRUE;
    vulkan_12_features.pNext = device_create_info_p_next;
    device_create_info_p_next = &vulkan_12_features;

--- a/source/blender/gpu/CMakeLists.txt
+++ b/source/blender/gpu/CMakeLists.txt
@ -231,6 +231,7 @@ set(VULKAN_SRC
  vulkan/vk_state_manager.cc
  vulkan/vk_storage_buffer.cc
  vulkan/vk_texture.cc
+  vulkan/vk_timeline_semaphore.cc
  vulkan/vk_uniform_buffer.cc
  vulkan/vk_vertex_attribute_object.cc
  vulkan/vk_vertex_buffer.cc
@ -271,6 +272,7 @@ set(VULKAN_SRC
  vulkan/vk_state_manager.hh
  vulkan/vk_storage_buffer.hh
  vulkan/vk_texture.hh
+  vulkan/vk_timeline_semaphore.hh
  vulkan/vk_uniform_buffer.hh
  vulkan/vk_vertex_attribute_object.hh
  vulkan/vk_vertex_buffer.hh
--- a/source/blender/gpu/vulkan/vk_command_buffers.cc
+++ b/source/blender/gpu/vulkan/vk_command_buffers.cc
@ -28,11 +28,6 @@ VKCommandBuffers::~VKCommandBuffers()
  VK_ALLOCATION_CALLBACKS;
  const VKDevice &device = VKBackend::get().device_get();

-  if (vk_fence_ != VK_NULL_HANDLE) {
-    vkDestroyFence(device.device_get(), vk_fence_, vk_allocation_callbacks);
-    vk_fence_ = VK_NULL_HANDLE;
-  }
-
  if (vk_command_pool_ != VK_NULL_HANDLE) {
    vkDestroyCommandPool(device.device_get(), vk_command_pool_, vk_allocation_callbacks);
    vk_command_pool_ = VK_NULL_HANDLE;
@ -54,7 +49,6 @@ void VKCommandBuffers::init(const VKDevice &device)
  }
  init_command_pool(device);
  init_command_buffers(device);
-  init_fence(device);
  submission_id_.reset();
 }

@ -103,38 +97,44 @@ void VKCommandBuffers::init_command_buffers(const VKDevice &device)
                      "Graphics Command Buffer");
 }

-void VKCommandBuffers::init_fence(const VKDevice &device)
+void VKCommandBuffers::submit_command_buffers(VKDevice &device,
+                                              MutableSpan<VKCommandBuffer *> command_buffers)
 {
-  if (vk_fence_ == VK_NULL_HANDLE) {
-    VK_ALLOCATION_CALLBACKS;
-    VkFenceCreateInfo fenceInfo{};
-    fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
-    vkCreateFence(device.device_get(), &fenceInfo, vk_allocation_callbacks, &vk_fence_);
-  }
-}
+  VKTimelineSemaphore &timeline_semaphore = device.timeline_semaphore_get();
+  VkSemaphore timeline_handle = timeline_semaphore.vk_handle();
+  VKTimelineSemaphore::Value wait_value = timeline_semaphore.value_get();
+  last_signal_value_ = timeline_semaphore.value_increase();

-static void submit_command_buffers(const VKDevice &device,
-                                   MutableSpan<VKCommandBuffer *> command_buffers,
-                                   VkFence vk_fence,
-                                   uint64_t timeout)
-{
  BLI_assert(ELEM(command_buffers.size(), 1, 2));
  VkCommandBuffer handles[2];
  int num_command_buffers = 0;
+
  for (VKCommandBuffer *command_buffer : command_buffers) {
    command_buffer->end_recording();
    handles[num_command_buffers++] = command_buffer->vk_command_buffer();
  }

+  VkTimelineSemaphoreSubmitInfo timelineInfo;
+  timelineInfo.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO;
+  timelineInfo.pNext = NULL;
+  timelineInfo.waitSemaphoreValueCount = 1;
+  timelineInfo.pWaitSemaphoreValues = wait_value;
+  timelineInfo.signalSemaphoreValueCount = 1;
+  timelineInfo.pSignalSemaphoreValues = last_signal_value_;
+  VkPipelineStageFlags wait_stages = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
  VkSubmitInfo submit_info = {};
  submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
  submit_info.commandBufferCount = num_command_buffers;
  submit_info.pCommandBuffers = handles;
+  submit_info.pNext = &timelineInfo;
+  submit_info.waitSemaphoreCount = 1;
+  submit_info.pWaitSemaphores = &timeline_handle;
+  submit_info.pWaitDstStageMask = &wait_stages;
+  submit_info.signalSemaphoreCount = 1;
+  submit_info.pSignalSemaphores = &timeline_handle;

-  vkQueueSubmit(device.queue_get(), 1, &submit_info, vk_fence);
-
-  vkWaitForFences(device.device_get(), 1, &vk_fence, VK_TRUE, timeout);
-  vkResetFences(device.device_get(), 1, &vk_fence);
+  vkQueueSubmit(device.queue_get(), 1, &submit_info, VK_NULL_HANDLE);
+  finish();

  for (VKCommandBuffer *command_buffer : command_buffers) {
    command_buffer->commands_submitted();
@ -144,7 +144,7 @@ static void submit_command_buffers(const VKDevice &device,

 void VKCommandBuffers::submit()
 {
-  const VKDevice &device = VKBackend::get().device_get();
+  VKDevice &device = VKBackend::get().device_get();
  VKCommandBuffer &data_transfer_compute = command_buffer_get(Type::DataTransferCompute);
  VKCommandBuffer &graphics = command_buffer_get(Type::Graphics);

@ -163,22 +163,21 @@ void VKCommandBuffers::submit()
    end_render_pass(*framebuffer);
    command_buffers[command_buffer_index++] = &graphics;
    submit_command_buffers(device,
-                           MutableSpan<VKCommandBuffer *>(command_buffers, command_buffer_index),
-                           vk_fence_,
-                           FenceTimeout);
+                           MutableSpan<VKCommandBuffer *>(command_buffers, command_buffer_index));
    begin_render_pass(*framebuffer);
  }
  else if (has_data_transfer_compute_work) {
    submit_command_buffers(device,
-                           MutableSpan<VKCommandBuffer *>(command_buffers, command_buffer_index),
-                           vk_fence_,
-                           FenceTimeout);
+                           MutableSpan<VKCommandBuffer *>(command_buffers, command_buffer_index));
  }
+}

-  const bool reset_submission_id = has_data_transfer_compute_work || has_graphics_work;
-  if (reset_submission_id) {
-    submission_id_.next();
-  }
+void VKCommandBuffers::finish()
+{
+  VKDevice &device = VKBackend::get().device_get();
+  VKTimelineSemaphore &timeline_semaphore = device.timeline_semaphore_get();
+  timeline_semaphore.wait(device, last_signal_value_);
+  submission_id_.next();
 }

 void VKCommandBuffers::ensure_no_draw_commands()
--- a/source/blender/gpu/vulkan/vk_command_buffers.hh
+++ b/source/blender/gpu/vulkan/vk_command_buffers.hh
@ -9,6 +9,7 @@
 #pragma once

 #include "vk_command_buffer.hh"
+#include "vk_timeline_semaphore.hh"

 namespace blender::gpu {
 class VKFrameBuffer;
@ -31,16 +32,12 @@ class VKCommandBuffers : public NonCopyable, NonMovable {
  };

  bool initialized_ = false;
-  /**
-   * Timeout to use when waiting for fences in nanoseconds.
-   *
-   * Currently added as the fence will halt when there are no commands in the command buffer for
-   * the second time. This should be solved and this timeout should be removed.
-   */
-  static constexpr uint64_t FenceTimeout = UINT64_MAX;

-  /* Fence for CPU GPU synchronization when submitting the command buffers. */
-  VkFence vk_fence_ = VK_NULL_HANDLE;
+  /**
+   * Last submitted timeline value, which can be used to validate that all commands
+   * submitted by these command buffers have finished.
+   */
+  VKTimelineSemaphore::Value last_signal_value_;

  /**
   * Active framebuffer for graphics command buffer.
@ -142,6 +139,7 @@ class VKCommandBuffers : public NonCopyable, NonMovable {
                             uint32_t stride);

  void submit();
+  void finish();

  const VKSubmissionID &submission_id_get() const
  {
@ -149,10 +147,11 @@ class VKCommandBuffers : public NonCopyable, NonMovable {
  }

 private:
-  void init_fence(const VKDevice &device);
  void init_command_pool(const VKDevice &device);
  void init_command_buffers(const VKDevice &device);

+  void submit_command_buffers(VKDevice &device, MutableSpan<VKCommandBuffer *> command_buffers);
+
  VKCommandBuffer &command_buffer_get(Type type)
  {
    return buffers_[(int)type];
--- a/source/blender/gpu/vulkan/vk_context.cc
+++ b/source/blender/gpu/vulkan/vk_context.cc
@ -127,7 +127,7 @@ void VKContext::flush()

 void VKContext::finish()
 {
-  command_buffers_.submit();
+  command_buffers_.finish();
 }

 void VKContext::memory_statistics_get(int *r_total_mem_kb, int *r_free_mem_kb)
--- a/source/blender/gpu/vulkan/vk_device.cc
+++ b/source/blender/gpu/vulkan/vk_device.cc
@ -28,6 +28,7 @@ void VKDevice::deinit()
    return;
  }

+  timeline_semaphore_.free(*this);
  dummy_buffer_.free();
  if (dummy_color_attachment_.has_value()) {
    delete &(*dummy_color_attachment_).get();
@ -74,6 +75,7 @@ void VKDevice::init(void *ghost_context)
  init_pipeline_cache();

  samplers_.init();
+  timeline_semaphore_.init(*this);

  debug::object_label(device_get(), "LogicalDevice");
  debug::object_label(queue_get(), "GenericQueue");
--- a/source/blender/gpu/vulkan/vk_device.hh
+++ b/source/blender/gpu/vulkan/vk_device.hh
@ -16,6 +16,7 @@
 #include "vk_debug.hh"
 #include "vk_descriptor_pools.hh"
 #include "vk_samplers.hh"
+#include "vk_timeline_semaphore.hh"

 namespace blender::gpu {
 class VKBackend;
@ -61,6 +62,9 @@ class VKDevice : public NonCopyable {

  VKSamplers samplers_;

+  /* Semaphore for CPU GPU synchronization when submitting commands to the queue. */
+  VKTimelineSemaphore timeline_semaphore_;
+
  /**
   * Available Contexts for this device.
   *
@ -222,6 +226,21 @@ class VKDevice : public NonCopyable {

  /** \} */

+  /* -------------------------------------------------------------------- */
+  /** \name Queue management
+   * \{ */
+
+  VKTimelineSemaphore &timeline_semaphore_get()
+  {
+    return timeline_semaphore_;
+  }
+  const VKTimelineSemaphore &timeline_semaphore_get() const
+  {
+    return timeline_semaphore_;
+  }
+
+  /** \} */
+
 private:
  void init_physical_device_properties();
  void init_physical_device_memory_properties();
--- a/source/blender/gpu/vulkan/vk_timeline_semaphore.cc
+++ b/source/blender/gpu/vulkan/vk_timeline_semaphore.cc
@ -0,0 +1,86 @@
+/* SPDX-FileCopyrightText: 2023 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+/** \file
+ * \ingroup gpu
+ */
+
+#include "vk_timeline_semaphore.hh"
+#include "vk_backend.hh"
+#include "vk_device.hh"
+#include "vk_memory.hh"
+
+namespace blender::gpu {
+
+/**
+ * Release the Vulkan semaphore through the device owned by the backend.
+ *
+ * `free()` returns early when no semaphore is allocated, so this is a no-op when the semaphore
+ * has already been freed explicitly.
+ */
+VKTimelineSemaphore::~VKTimelineSemaphore()
+{
+  free(VKBackend::get().device_get());
+}
+
+/**
+ * Create the Vulkan timeline semaphore on the given device and reset the tracked values.
+ *
+ * Does nothing when the semaphore has already been created.
+ *
+ * NOTE(review): the result of `vkCreateSemaphore` is not checked here.
+ */
+void VKTimelineSemaphore::init(const VKDevice &device)
+{
+  if (vk_semaphore_ != VK_NULL_HANDLE) {
+    return;
+  }
+
+  VK_ALLOCATION_CALLBACKS;
+  /* Chained into the create info to request a timeline semaphore with a counter starting at 0. */
+  VkSemaphoreTypeCreateInfo semaphore_type_create_info = {};
+  semaphore_type_create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO;
+  semaphore_type_create_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE;
+  semaphore_type_create_info.initialValue = 0;
+
+  VkSemaphoreCreateInfo semaphore_create_info{};
+  semaphore_create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+  semaphore_create_info.pNext = &semaphore_type_create_info;
+  vkCreateSemaphore(
+      device.device_get(), &semaphore_create_info, vk_allocation_callbacks, &vk_semaphore_);
+  debug::object_label(vk_semaphore_, "TimelineSemaphore");
+
+  /* The new semaphore starts at 0: clear both the submitted and the completed value so that the
+   * completed value can never be ahead of the submitted one. */
+  value_.reset();
+  last_completed_.reset();
+}
+
+/**
+ * Destroy the Vulkan semaphore and reset the tracked values.
+ *
+ * Does nothing when no semaphore is allocated, so it can be called more than once.
+ */
+void VKTimelineSemaphore::free(const VKDevice &device)
+{
+  if (vk_semaphore_ == VK_NULL_HANDLE) {
+    return;
+  }
+
+  VK_ALLOCATION_CALLBACKS;
+  vkDestroySemaphore(device.device_get(), vk_semaphore_, vk_allocation_callbacks);
+  vk_semaphore_ = VK_NULL_HANDLE;
+
+  /* Both counters describe the destroyed semaphore; clear them together so a stale completed
+   * value does not survive into a later `init()`. */
+  value_.reset();
+  last_completed_.reset();
+}
+
+/**
+ * Block the calling thread until the semaphore counter has reached `wait_value`.
+ *
+ * The wait uses no timeout (`UINT64_MAX`). The completed value is only updated when the wait
+ * succeeded, and it is never moved backwards when an older value is waited on.
+ */
+void VKTimelineSemaphore::wait(const VKDevice &device, const Value &wait_value)
+{
+  BLI_assert(vk_semaphore_ != VK_NULL_HANDLE);
+
+  VkSemaphoreWaitInfo wait_info = {};
+  wait_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO;
+  wait_info.semaphoreCount = 1;
+  wait_info.pSemaphores = &vk_semaphore_;
+  /* `Value` converts to a pointer to its counter; `wait_value` outlives the call below. */
+  wait_info.pValues = wait_value;
+  const VkResult result = vkWaitSemaphores(device.device_get(), &wait_info, UINT64_MAX);
+  if (result != VK_SUCCESS) {
+    /* The wait failed: nothing is known to be completed, keep the previous value. */
+    return;
+  }
+
+  if (last_completed_ < wait_value) {
+    last_completed_ = wait_value;
+  }
+}
+
+/**
+ * Advance the locally tracked timeline value by one and return the new value.
+ *
+ * Only the local counter is changed here; no Vulkan call is made.
+ */
+VKTimelineSemaphore::Value VKTimelineSemaphore::value_increase()
+{
+  value_.increase();
+  return value_;
+}
+
+/** Return a copy of the locally tracked timeline value without modifying it. */
+VKTimelineSemaphore::Value VKTimelineSemaphore::value_get() const
+{
+  return value_;
+}
+
+/** Return the value last recorded as completed by `wait()`. */
+VKTimelineSemaphore::Value VKTimelineSemaphore::last_completed_value_get() const
+{
+  return last_completed_;
+}
+
+}  // namespace blender::gpu
--- a/source/blender/gpu/vulkan/vk_timeline_semaphore.hh
+++ b/source/blender/gpu/vulkan/vk_timeline_semaphore.hh
@ -0,0 +1,109 @@
+/* SPDX-FileCopyrightText: 2023 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+/** \file
+ * \ingroup gpu
+ */
+
+#pragma once
+
+#include "vk_common.hh"
+
+namespace blender::gpu {
+class VKDevice;
+
+/**
+ * A timeline semaphore is a special semaphore type used to synchronize between commands and
+ * resource usage in a time aware fashion.
+ *
+ * Synchronization is a core part of Vulkan and the Timeline Semaphore is a utility that
+ * facilitates its implementation in Blender.
+ *
+ * There are resources that need to be tracked in time in order to know when to submit, free or
+ * reuse these resources. Some use cases are:
+ *
+ * - Command buffers can only be reset or freed when they are executed on the device. When the
+ *   command buffers are still pending for execution they may not be reused or freed.
+ * - Buffers are only allowed to be reuploaded when they are not used at this moment by the device.
+ *   This CPU/GPU synchronization can be guarded by a timeline semaphore. In this case barriers
+ *   may not be used as they don't cover CPU synchronization for host allocated buffers.
+ *
+ * Usage:
+ *
+ * For each device queue a timeline semaphore should be constructed. Every time a command
+ * buffer is submitted, the submission will wait for the current timeline value to be completed.
+ * Locally the command buffer can keep track of the timeline value when submitting commands so
+ * `gpuFinish` could be implemented in a context aware fashion.
+ *
+ * #VKTimelineSemaphore::Value can be stored locally. By calling the wait function you can ensure
+ * that at least the given value has been finished.
+ */
+class VKTimelineSemaphore {
+ public:
+  /**
+   * VKTimelineSemaphore::Value is used to track the timeline semaphore value.
+   *
+   * Values can be copied and compared by any code; only #VKTimelineSemaphore can modify the
+   * stored counter.
+   */
+  class Value {
+    uint64_t value_ = 0;
+
+   public:
+    /**
+     * Implicit conversion to a pointer to the stored counter, so a value can be assigned
+     * directly to Vulkan structure members that expect a `const uint64_t *`.
+     *
+     * The returned pointer refers to this object and is only valid while it is alive.
+     */
+    operator const uint64_t *() const
+    {
+      return &value_;
+    }
+
+    bool operator<(const Value &other) const
+    {
+      return this->value_ < other.value_;
+    }
+
+    bool operator==(const Value &other) const
+    {
+      return this->value_ == other.value_;
+    }
+
+    /* The remaining comparisons are spelled out explicitly: without them an expression such as
+     * `a != b` or `a > b` could still compile through the pointer conversion above and compare
+     * addresses instead of counter values. */
+    bool operator!=(const Value &other) const
+    {
+      return this->value_ != other.value_;
+    }
+
+    bool operator>(const Value &other) const
+    {
+      return this->value_ > other.value_;
+    }
+
+    bool operator<=(const Value &other) const
+    {
+      return this->value_ <= other.value_;
+    }
+
+    bool operator>=(const Value &other) const
+    {
+      return this->value_ >= other.value_;
+    }
+
+   private:
+    void reset()
+    {
+      value_ = 0;
+    }
+
+    void increase()
+    {
+      value_++;
+    }
+
+    friend class VKTimelineSemaphore;
+  };
+
+ private:
+  VkSemaphore vk_semaphore_ = VK_NULL_HANDLE;
+  /** Locally tracked timeline value; advanced by #value_increase. */
+  Value value_;
+  /** Value recorded by #wait as completed. */
+  Value last_completed_;
+
+ public:
+  VKTimelineSemaphore() = default;
+  /* The destructor releases `vk_semaphore_`; a copy would destroy the same handle twice. */
+  VKTimelineSemaphore(const VKTimelineSemaphore &) = delete;
+  VKTimelineSemaphore &operator=(const VKTimelineSemaphore &) = delete;
+  ~VKTimelineSemaphore();
+
+  void init(const VKDevice &device);
+  void free(const VKDevice &device);
+
+  /**
+   * Wait for semaphore completion.
+   *
+   * Ensures that all commands queued before and including the given value have finished.
+   */
+  void wait(const VKDevice &device, const Value &value);
+
+  Value value_increase();
+  Value value_get() const;
+  Value last_completed_value_get() const;
+
+  VkSemaphore vk_handle() const
+  {
+    BLI_assert(vk_semaphore_ != VK_NULL_HANDLE);
+    return vk_semaphore_;
+  }
+};
+
+}  // namespace blender::gpu