Metal: Add support for atomic image operations

Texture Atomics have been added in Metal 3.1
and enable the original implementations of
shadow update and irradiance cache baking.

However, a fallback solution will be
required for versions under macOS 14.0 utilising
buffer-backed textures instead.

This patch also includes a stub implementation when
building/running on older macOS versions, which
provides locally-synchronized texture access in
place of atomics. This enables some effects to be
partially tested, and ensures non-guarded use
of imageAtomic functions does not result
in compilation failure.

Authored by Apple: Michael Parkin-White

Pull Request: https://projects.blender.org/blender/blender/pulls/112866
This commit is contained in:
Jason Fielder 2023-09-25 21:56:46 +02:00 committed by Clément Foucault
parent 499c39cfb9
commit ee03bb38cb
9 changed files with 195 additions and 12 deletions

View File

@@ -942,7 +942,8 @@ void IrradianceBake::clusters_build()
if (max_virtual_offset_ == 0.0f) {
return;
}
eGPUTextureUsage texture_usage = GPU_TEXTURE_USAGE_SHADER_READ | GPU_TEXTURE_USAGE_SHADER_WRITE;
eGPUTextureUsage texture_usage = GPU_TEXTURE_USAGE_SHADER_READ | GPU_TEXTURE_USAGE_SHADER_WRITE |
GPU_TEXTURE_USAGE_ATOMIC;
cluster_list_tx_.ensure_3d(GPU_R32I, capture_info_buf_.irradiance_grid_size, texture_usage);
cluster_list_tx_.clear(int4(-1));

View File

@@ -662,7 +662,8 @@ void ShadowModule::init()
const int2 atlas_extent = shadow_page_size_ * int2(SHADOW_PAGE_PER_ROW);
const int atlas_layers = divide_ceil_u(shadow_page_len_, SHADOW_PAGE_PER_LAYER);
eGPUTextureUsage tex_usage = GPU_TEXTURE_USAGE_SHADER_READ | GPU_TEXTURE_USAGE_SHADER_WRITE;
eGPUTextureUsage tex_usage = GPU_TEXTURE_USAGE_SHADER_READ | GPU_TEXTURE_USAGE_SHADER_WRITE |
GPU_TEXTURE_USAGE_ATOMIC;
if (atlas_tx_.ensure_2d_array(atlas_type, atlas_extent, atlas_layers, tex_usage)) {
/* Global update. */
do_full_update = true;

View File

@@ -91,6 +91,7 @@ GPU_SHADER_CREATE_INFO(eevee_surfel_cluster_build)
GPU_SHADER_CREATE_INFO(eevee_surfel_list_build)
.local_group_size(SURFEL_GROUP_SIZE)
.builtins(BuiltinBits::TEXTURE_ATOMIC)
.additional_info("eevee_shared", "eevee_surfel_common", "draw_view")
.storage_buf(0, Qualifier::READ_WRITE, "int", "list_start_buf[]")
.storage_buf(6, Qualifier::READ_WRITE, "SurfelListInfoData", "list_info_buf")

View File

@@ -193,6 +193,7 @@ GPU_SHADER_CREATE_INFO(eevee_surf_shadow)
.define("USE_ATOMIC")
.builtins(BuiltinBits::VIEWPORT_INDEX)
.builtins(BuiltinBits::LAYER)
.builtins(BuiltinBits::TEXTURE_ATOMIC)
.vertex_out(eevee_surf_iface)
.vertex_out(eevee_surf_flat_iface)
.storage_buf(SHADOW_RENDER_MAP_BUF_SLOT,

View File

@@ -548,9 +548,12 @@ typedef enum eGPUTextureUsage {
/* When used, the texture will not have any backing storage and can solely exist as a virtual
* frame-buffer attachment. */
GPU_TEXTURE_USAGE_MEMORYLESS = (1 << 5),
/* Whether a texture can support atomic operations. */
GPU_TEXTURE_USAGE_ATOMIC = (1 << 6),
/* Create a texture whose usage cannot be defined prematurely.
* This is unoptimized and should not be used. */
GPU_TEXTURE_USAGE_GENERAL = (0xFF & (~GPU_TEXTURE_USAGE_MEMORYLESS)),
GPU_TEXTURE_USAGE_GENERAL = (0xFF &
(~(GPU_TEXTURE_USAGE_MEMORYLESS | GPU_TEXTURE_USAGE_ATOMIC))),
} eGPUTextureUsage;
ENUM_OPERATORS(eGPUTextureUsage, GPU_TEXTURE_USAGE_GENERAL);

View File

@@ -208,6 +208,9 @@ enum class BuiltinBits {
*/
VIEWPORT_INDEX = (1 << 17),
/* Texture atomics requires usage options to alter compilation flag. */
TEXTURE_ATOMIC = (1 << 18),
/* Not a builtin but a flag we use to tag shaders that use the debug features. */
USE_DEBUG_DRAW = (1 << 29),
USE_DEBUG_PRINT = (1 << 30),

View File

@@ -300,6 +300,14 @@ bool MTLShader::finalize(const shader::ShaderCreateInfo *info)
options.languageVersion = MTLLanguageVersion2_3;
}
}
#if defined(MAC_OS_VERSION_14_0)
if (@available(macOS 14.00, *)) {
/* Texture atomics require Metal 3.1. */
if (bool(info->builtins_ & BuiltinBits::TEXTURE_ATOMIC)) {
options.languageVersion = MTLLanguageVersion3_1;
}
}
#endif
NSString *source_to_compile = shd_builder_->msl_source_vert_;

View File

@@ -618,6 +618,14 @@ inline MTLTextureUsage mtl_usage_from_gpu(eGPUTextureUsage usage)
if (usage & GPU_TEXTURE_USAGE_MIP_SWIZZLE_VIEW) {
mtl_usage = mtl_usage | MTLTextureUsagePixelFormatView;
}
#if defined(MAC_OS_VERSION_14_0)
if (@available(macOS 14.0, *)) {
if (usage & GPU_TEXTURE_USAGE_ATOMIC) {
mtl_usage = mtl_usage | MTLTextureUsageShaderAtomic;
}
}
#endif
return mtl_usage;
}

View File

@@ -201,15 +201,6 @@ template<typename T> T atomicExchange(device T &mem, T data)
return atomic_exchange_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
/* Unblock texture atomic compilation.
* TODO(Metal): This is not correct for global atomic behavior, but will be safe within a single
* thread.
* We need to re-visit the solution for this use-case and use a 2D texture buffer instead. */
#define imageAtomicMin(tex, coord, data) \
uint val = _texelFetch_internal(tex, coord, 0).r; \
_texture_write_internal(tex, coord, uint4((val < data) ? val : data)); \
tex.texture->fence();
/* Used to replace 'out' in function parameters with thread-local reference
* shortened to avoid expanding the GLSL source string. */
#define THD thread
@@ -984,6 +975,172 @@ inline void _texture_write_internal_fast(thread _mtl_combined_image_sampler_3d<S
tex.texture->write(value, uint3(_coord.xyz));
}
/* Texture atomic operations are only supported in Metal 3.1 and onwards (macOS 14.0 Sonoma). */
#if __METAL_VERSION__ >= 310
/* Image atomic operations. */
/* Map the GLSL-style imageAtomic* entry points onto the native texture-atomic wrapper
 * templates defined below; overload resolution selects the correct dimensionality. */
# define imageAtomicMin(tex, coord, data) _texture_image_atomic_min_internal(tex, coord, data)
# define imageAtomicExchange(tex, coord, data) \
_texture_image_atomic_exchange_internal(tex, coord, data)
/* Atomic Min. */
/* Atomic fetch-min on a texel of a 1D image; returns the value previously stored.
 * NOTE(review): assumes the Metal target exposes texture atomics on 1D textures -- confirm
 * against the MSL 3.1 specification. */
template<typename S, access A>
vec<S, 4> _texture_image_atomic_min_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
                                             int coord,
                                             vec<S, 4> data)
{
  const uint texel = uint(coord);
  return tex.texture->atomic_fetch_min(texel, data);
}
/* Atomic fetch-min on a texel of a 1D array image; returns the value previously stored. */
template<typename S, access A>
vec<S, 4> _texture_image_atomic_min_internal(thread _mtl_combined_image_sampler_1d_array<S, A> tex,
                                             int2 coord,
                                             vec<S, 4> data)
{
  /* coord.x selects the texel, coord.y selects the array layer. */
  const uint texel = uint(coord.x);
  const uint layer = uint(coord.y);
  return tex.texture->atomic_fetch_min(texel, layer, data);
}
/* Atomic fetch-min on a texel of a 2D image; returns the value previously stored. */
template<typename S, access A>
vec<S, 4> _texture_image_atomic_min_internal(thread _mtl_combined_image_sampler_2d<S, A> tex,
                                             int2 coord,
                                             vec<S, 4> data)
{
  const uint2 texel = uint2(coord.xy);
  return tex.texture->atomic_fetch_min(texel, data);
}
/* Atomic fetch-min on a texel of a 2D array image; returns the value previously stored. */
template<typename S, access A>
vec<S, 4> _texture_image_atomic_min_internal(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
                                             int3 coord,
                                             vec<S, 4> data)
{
  /* coord.xy selects the texel, coord.z selects the array layer. */
  const uint2 texel = uint2(coord.xy);
  const uint layer = uint(coord.z);
  return tex.texture->atomic_fetch_min(texel, layer, data);
}
/* Atomic fetch-min on a texel of a 3D image; returns the value previously stored. */
template<typename S, access A>
vec<S, 4> _texture_image_atomic_min_internal(thread _mtl_combined_image_sampler_3d<S, A> tex,
                                             int3 coord,
                                             vec<S, 4> data)
{
  const uint3 texel = uint3(coord);
  return tex.texture->atomic_fetch_min(texel, data);
}
/* Atomic Exchange. */
template<typename S, access A, int N>
vec<S, N> _texture_image_atomic_exchange_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
int coord,
vec<S, N> data)
{
return tex.texture->atomic_exchange(uint(coord), data);
}
/* Atomically swap the stored 1D-array texel with `data`; returns the previous value. */
template<typename S, access A>
S _texture_image_atomic_exchange_internal(thread _mtl_combined_image_sampler_1d_array<S, A> tex,
                                          int2 coord,
                                          S data)
{
  /* coord.x selects the texel, coord.y selects the array layer. */
  const uint texel = uint(coord.x);
  const uint layer = uint(coord.y);
  return tex.texture->atomic_exchange(texel, layer, data);
}
/* Atomically swap the stored 2D texel with `data`; returns the previous value.
 * NOTE(review): the original declared an unused, non-defaulted `int N` template parameter
 * that appears nowhere in the function parameters; argument deduction can never supply it,
 * so the overload was unselectable from the imageAtomicExchange macro. Removed. */
template<typename S, access A>
S _texture_image_atomic_exchange_internal(thread _mtl_combined_image_sampler_2d<S, A> tex,
                                          int2 coord,
                                          S data)
{
  return tex.texture->atomic_exchange(uint2(coord.xy), data);
}
/* Atomically swap the stored 2D-array texel with `data`; returns the previous value.
 * coord.xy selects the texel, coord.z selects the array layer.
 * NOTE(review): the original declared an unused, non-defaulted `int N` template parameter
 * that cannot be deduced from the arguments, making the overload unselectable. Removed. */
template<typename S, access A>
S _texture_image_atomic_exchange_internal(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
                                          int3 coord,
                                          S data)
{
  return tex.texture->atomic_exchange(uint2(coord.xy), uint(coord.z), data);
}
/* Atomically swap the stored 3D texel with `data`; returns the previous value.
 * NOTE(review): the original declared an unused, non-defaulted `int N` template parameter
 * that cannot be deduced from the arguments, making the overload unselectable. Removed. */
template<typename S, access A>
S _texture_image_atomic_exchange_internal(thread _mtl_combined_image_sampler_3d<S, A> tex,
                                          int3 coord,
                                          S data)
{
  return tex.texture->atomic_exchange(uint3(coord), data);
}
#else
/* Fallback for imageAtomicMin if texture atomics are unsupported.
 * NOTE(review): this is presumably "not safe for general concurrent thread access, but will
 * allow inter-thread writing" -- the original comment was garbled; confirm intent.
 * This assumes a 2D texture array, and -- unlike GLSL's imageAtomicMin -- is a statement
 * that discards the previous value.
 * NOTE: Implementations should switch to a buffer-backed texture write in these cases.
 * The body is wrapped in `do { } while (0)` so the macro expands safely inside unbraced
 * `if`/`else` bodies and so `val` does not leak into (or collide with) the caller's scope. */
# define imageAtomicMin(tex, coord, data) \
    do { \
      auto val = _texelFetch_internal(tex, coord, 0).r; \
      tex.texture->fence(); \
      tex.texture->write(min(val, data), uint2(coord.xy), uint(coord.z)); \
      tex.texture->fence(); \
    } while (0)
/* Fallback for atomic Exchange if atomics are unavailable.
 * NOTE: Implementations should switch to a buffer-backed texture write in these cases. */
# define imageAtomicExchange(tex, coord, data) \
    _texture_image_atomic_exchange_internal_fallback(tex, coord, data)
/* Fallback exchange for a 1D image: emulate the atomic with fenced read/write.
 * Only safe with respect to a single thread's own accesses, not true concurrency.
 * NOTE(review): the original passed `data` as the trailing argument of texture::read(),
 * which Metal interprets as the mip level (lod); read from mip 0 instead. */
template<typename S, access A>
S _texture_image_atomic_exchange_internal_fallback(thread _mtl_combined_image_sampler_1d<S, A> tex,
                                                   int coord,
                                                   S data)
{
  S val = tex.texture->read(uint(coord)).x;
  tex.texture->fence();
  tex.texture->write(data, uint(coord));
  tex.texture->fence();
  return val;
}
/* Fallback exchange for a 1D array image via fenced read/write (thread-local safety only).
 * coord.x selects the texel, coord.y selects the array layer.
 * NOTE(review): the original passed `data` as the trailing argument of texture::read(),
 * which Metal interprets as the mip level (lod); read from mip 0 instead. */
template<typename S, access A>
S _texture_image_atomic_exchange_internal_fallback(
    thread _mtl_combined_image_sampler_1d_array<S, A> tex, int2 coord, S data)
{
  S val = tex.texture->read(uint(coord.x), uint(coord.y)).x;
  tex.texture->fence();
  tex.texture->write(data, uint(coord.x), uint(coord.y));
  tex.texture->fence();
  return val;
}
/* Fallback exchange for a 2D image via fenced read/write (thread-local safety only).
 * NOTE(review): the original wrote to `uint2(coord.x)` -- a scalar splat yielding
 * texel (x, x) -- instead of the texel that was read; and it passed `data` as the
 * lod argument of texture::read(). Both are corrected here. */
template<typename S, access A>
S _texture_image_atomic_exchange_internal_fallback(thread _mtl_combined_image_sampler_2d<S, A> tex,
                                                   int2 coord,
                                                   S data)
{
  S val = tex.texture->read(uint2(coord)).x;
  tex.texture->fence();
  tex.texture->write(data, uint2(coord));
  tex.texture->fence();
  return val;
}
/* Fallback exchange for a 2D array image via fenced read/write (thread-local safety only).
 * coord.xy selects the texel, coord.z selects the array layer.
 * NOTE(review): the original passed `data` as the trailing argument of texture::read(),
 * which Metal interprets as the mip level (lod); read from mip 0 instead. */
template<typename S, access A>
S _texture_image_atomic_exchange_internal_fallback(
    thread _mtl_combined_image_sampler_2d_array<S, A> tex, int3 coord, S data)
{
  S val = tex.texture->read(uint2(coord.xy), uint(coord.z)).x;
  tex.texture->fence();
  tex.texture->write(data, uint2(coord.xy), uint(coord.z));
  tex.texture->fence();
  return val;
}
/* Fallback exchange for a 3D image via fenced read/write (thread-local safety only).
 * NOTE(review): the original passed `data` as the trailing argument of texture::read(),
 * which Metal interprets as the mip level (lod); read from mip 0 instead. */
template<typename S, access A>
S _texture_image_atomic_exchange_internal_fallback(thread _mtl_combined_image_sampler_3d<S, A> tex,
                                                   int3 coord,
                                                   S data)
{
  S val = tex.texture->read(uint3(coord)).x;
  tex.texture->fence();
  tex.texture->write(data, uint3(coord));
  tex.texture->fence();
  return val;
}
#endif
/* Matrix compare operators. */
/** TODO(fclem): Template. */
inline bool operator==(float4x4 a, float4x4 b)