Fix #116128: Resolve EEVEE Next deferred light sync in Metal

Render-to-render workload dependencies were not being synchronized
correctly in Metal. This PR adds a hard pass break where
GPU_memory_barrier calls occur during render workloads, ensuring
non-pixel-local writes are visible to subsequent render invocations
as needed. This is required to support full pass dependencies on a
tile-based GPU architecture.
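
As an illustration of what the hard pass break means for callers,
consider a pass whose non-pixel-local image writes are sampled by a
later pass. This is a minimal sketch, not code from this commit:
draw_pass_a()/draw_pass_b() are hypothetical stand-ins, while
GPU_memory_barrier() and the barrier flag are existing GPU module API.

/* Hypothetical example: pass A performs non-pixel-local writes that
 * pass B samples as a texture. */
draw_pass_a();
/* On a TBDR Metal device this barrier now ends the active render
 * command encoder (hard pass break), flushing tile memory so the
 * writes become visible to pass B. On immediate-mode GPUs it remains
 * a regular memory barrier. */
GPU_memory_barrier(GPU_BARRIER_TEXTURE_FETCH);
draw_pass_b();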

Note that these barriers are therefore expensive, so they are skipped
where dependencies are local and fragment execution order is already
guaranteed via either blend order or explicit raster_order_groups.
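
On the EEVEE side this reduces to the guard pattern below, a condensed
sketch of the diff that follows ("sub" is the pass sub-group the
barrier would otherwise be recorded into):

/* Detect a tile-based Metal device once during sync. */
bool is_tbdr_arch_metal = (GPU_platform_architecture() == GPU_ARCHITECTURE_TBDR) &&
                          (GPU_backend_get_type() == GPU_BACKEND_METAL);

/* Only record the cross-pass barrier where it is actually needed. On
 * TBDR Metal these dependencies are pixel-local (blend order or
 * raster_order_groups already order fragment execution), so issuing
 * the barrier would only force an expensive encoder split. */
if (!is_tbdr_arch_metal) {
  sub.barrier(GPU_BARRIER_TEXTURE_FETCH);
}
sub.draw_procedural(GPU_PRIM_TRIS, 1, 3);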

Authored by Apple: Michael Parkin-White

Pull Request: https://projects.blender.org/blender/blender/pulls/116656
Commit cbcb6abfee (parent cd4328dd82)
Jason Fielder, 2024-01-03 05:38:42 +01:00; committed by Clément Foucault
2 changed files with 28 additions and 20 deletions


@@ -508,6 +508,11 @@ void DeferredLayer::end_sync()
   if (closure_bits_ & evaluated_closures) {
     RenderBuffersInfoData &rbuf_data = inst_.render_buffers.data;
 
+    /* NOTE: For tile-based GPU architectures, barriers are not always needed if implicit local
+     * ordering is guaranteed via either blending order or explicit raster_order_groups. */
+    bool is_tbdr_arch_metal = (GPU_platform_architecture() == GPU_ARCHITECTURE_TBDR) &&
+                              (GPU_backend_get_type() == GPU_BACKEND_METAL);
+
     /* Add the tile classification step at the end of the GBuffer pass. */
     {
       /* Fill tile mask texture with the collected closure present in a tile. */
@@ -523,7 +528,9 @@ void DeferredLayer::end_sync()
       sub.shader_set(inst_.shaders.static_shader_get(DEFERRED_TILE_CLASSIFY));
       sub.bind_image("tile_mask_img", &tile_mask_tx_);
       sub.push_constant("closure_tile_size_shift", &closure_tile_size_shift_);
-      sub.barrier(GPU_BARRIER_TEXTURE_FETCH);
+      if (!is_tbdr_arch_metal) {
+        sub.barrier(GPU_BARRIER_TEXTURE_FETCH);
+      }
       sub.draw_procedural(GPU_PRIM_TRIS, 1, 3);
     }
     {
@@ -566,7 +573,9 @@ void DeferredLayer::end_sync()
       /* WORKAROUND: Avoid rasterizer discard by enabling stencil write, but the shaders actually
        * use no fragment output. */
       sub.state_set(DRW_STATE_WRITE_STENCIL | DRW_STATE_STENCIL_EQUAL | DRW_STATE_DEPTH_GREATER);
-      sub.barrier(GPU_BARRIER_SHADER_STORAGE);
+      if (!is_tbdr_arch_metal) {
+        sub.barrier(GPU_BARRIER_SHADER_STORAGE);
+      }
       sub.bind_texture(RBUFS_UTILITY_TEX_SLOT, inst_.pipelines.utility_tx);
       sub.bind_image(RBUFS_COLOR_SLOT, &inst_.render_buffers.rp_color_tx);
       sub.bind_image(RBUFS_VALUE_SLOT, &inst_.render_buffers.rp_value_tx);
@@ -595,17 +604,7 @@ void DeferredLayer::end_sync()
         sub.bind_resources(inst_.reflection_probes);
         sub.bind_resources(inst_.irradiance_cache);
         sub.state_stencil(0xFFu, 1u << i, 0xFFu);
-        if (GPU_backend_get_type() == GPU_BACKEND_METAL) {
-          /* WORKAROUND: On Apple silicon the stencil test is broken. Only issue one expensive
-           * lighting evaluation. */
-          if (i == 2) {
-            sub.state_set(DRW_STATE_WRITE_STENCIL | DRW_STATE_DEPTH_GREATER);
-            sub.draw_procedural(GPU_PRIM_TRIS, 1, 3);
-          }
-        }
-        else {
-          sub.draw_procedural(GPU_PRIM_TRIS, 1, 3);
-        }
+        sub.draw_procedural(GPU_PRIM_TRIS, 1, 3);
       }
     }
   }
@@ -634,7 +633,9 @@ void DeferredLayer::end_sync()
       pass.bind_image(RBUFS_VALUE_SLOT, &inst_.render_buffers.rp_value_tx);
       pass.bind_resources(inst_.gbuffer);
       pass.bind_resources(inst_.uniform_data);
-      pass.barrier(GPU_BARRIER_TEXTURE_FETCH | GPU_BARRIER_SHADER_IMAGE_ACCESS);
+      if (!is_tbdr_arch_metal) {
+        pass.barrier(GPU_BARRIER_TEXTURE_FETCH | GPU_BARRIER_SHADER_IMAGE_ACCESS);
+      }
       pass.draw_procedural(GPU_PRIM_TRIS, 1, 3);
     }
   }

@@ -582,12 +582,19 @@ bool MTLCommandBufferManager::insert_memory_barrier(eGPUBarrier barrier_bits,
   if (@available(macOS 10.14, *)) {
     /* Apple Silicon does not support memory barriers for RenderCommandEncoder's.
-     * We do not currently need these due to implicit API guarantees.
-     * NOTE(Metal): MTLFence/MTLEvent may be required to synchronize work if
-     * untracked resources are ever used. */
-    if ([context_.device hasUnifiedMemory] &&
-        (active_command_encoder_type_ != MTL_COMPUTE_COMMAND_ENCODER))
-    {
+     * We do not currently need these due to implicit API guarantees. However, render->render
+     * resource dependencies are only evaluated at RenderCommandEncoder boundaries due to work
+     * execution on TBDR architecture.
+     *
+     * NOTE: Render barriers are therefore inherently expensive. Where possible, opt for local
+     * synchronization using raster order groups, or, prefer compute to avoid subsequent passes
+     * re-loading pass attachments which are not needed. */
+    const bool is_tile_based_arch = (GPU_platform_architecture() == GPU_ARCHITECTURE_TBDR);
+    if (is_tile_based_arch && (active_command_encoder_type_ != MTL_COMPUTE_COMMAND_ENCODER)) {
+      if (active_command_encoder_type_ == MTL_RENDER_COMMAND_ENCODER) {
+        end_active_command_encoder();
+        return true;
+      }
       return false;
     }
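
The guidance in the new comment maps onto the GPU module as follows:
the early-out above only applies outside compute encoders, so a
compute->compute dependency keeps using a cheap encoder-local barrier,
while a render->render dependency now costs a full encoder split on
TBDR Metal. A hypothetical sketch (shader handles and group counts
are illustrative, not from this commit):

/* Compute->compute dependency: stays inside one compute encoder, so
 * the barrier is cheap (no pass break). */
GPU_compute_dispatch(scatter_shader, groups_x, groups_y, 1);
GPU_memory_barrier(GPU_BARRIER_SHADER_STORAGE);
GPU_compute_dispatch(gather_shader, groups_x, groups_y, 1);

/* The same barrier between two draws would instead end the active
 * render command encoder and re-load pass attachments for the next
 * draw, which is why the comment suggests preferring compute for work
 * with cross-tile dependencies. */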