From d1a9425a2fde32b6786b333ab55661da507e818b Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Wed, 3 Nov 2021 17:28:12 +0100
Subject: [PATCH 1/3] Fix T91733, T92486: Cycles wrong shadow catcher with
 volumes

Changes:
* After hitting a shadow catcher, re-initialize the volume stack taking
  into account shadow catcher ray visibility. This ensures that volume objects
  are included in the stack only if they are shadow catchers.
* If there is a volume to be shaded in front of the shadow catcher, the split
  is now performed in the shade_volume kernel after volume shading is done.
* Previously the background pass behind a shadow catcher was done as part of
  the regular path; now it is done as part of the shadow catcher path.

For a shadow catcher path with volumes and visible background, operations are
done in this order now:

* intersect_closest
* shade_volume
* shadow catcher split
* intersect_volume_stack
* shade_background
* shade_surface

The world volume is currently assumed to be CG, that is, it does not exist in
the footage. We may consider adding an option to control this, or changing the
default. With a volume object this control is already possible.

This includes refactoring to centralize the logic for next kernel scheduling
in intersect_closest.h.

Differential Revision: https://developer.blender.org/D13093
---
 .../kernel/integrator/intersect_closest.h     | 272 ++++++++++++------
 .../integrator/intersect_volume_stack.h       |  42 ++-
 .../kernel/integrator/shade_background.h      |  20 +-
 .../cycles/kernel/integrator/shade_volume.h   |  22 +-
 .../cycles/kernel/integrator/shadow_catcher.h |  27 --
 intern/cycles/kernel/integrator/state.h       |   8 +-
 intern/cycles/kernel/integrator/state_util.h  |   8 +-
 7 files changed, 233 insertions(+), 166 deletions(-)

diff --git a/intern/cycles/kernel/integrator/intersect_closest.h b/intern/cycles/kernel/integrator/intersect_closest.h
index 2cac18ed889..5522b46205b 100644
--- a/intern/cycles/kernel/integrator/intersect_closest.h
+++ b/intern/cycles/kernel/integrator/intersect_closest.h
@@ -31,7 +31,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-template<uint32_t current_kernel>
 ccl_device_forceinline bool integrator_intersect_terminate(KernelGlobals kg,
                                                            IntegratorState state,
                                                            const int shader_flags)
@@ -86,36 +85,75 @@ ccl_device_forceinline bool integrator_intersect_terminate(KernelGlobals kg,
   return false;
 }
 
-/* Note that current_kernel is a template value since making this a variable
- * leads to poor performance with CUDA atomics. */
-template<uint32_t current_kernel>
-ccl_device_forceinline void integrator_intersect_shader_next_kernel(
-    KernelGlobals kg,
-    IntegratorState state,
-    ccl_private const Intersection *ccl_restrict isect,
-    const int shader,
-    const int shader_flags)
+#ifdef __SHADOW_CATCHER__
+/* Split path if a shadow catcher was hit. */
+ccl_device_forceinline void integrator_split_shadow_catcher(
+    KernelGlobals kg, IntegratorState state, ccl_private const Intersection *ccl_restrict isect)
 {
-  /* Note on scheduling.
-   *
-   * When there is no shadow catcher split the scheduling is simple: schedule surface shading with
-   * or without raytrace support, depending on the shader used.
-   *
-   * When there is a shadow catcher split the general idea is to have the following configuration:
-   *
-   *  - Schedule surface shading kernel (with corresponding raytrace support) for the ray which
-   *    will trace shadow catcher object.
-   *
-   *  - When no alpha-over of approximate shadow catcher is needed, schedule surface shading for
-   *    the matte ray.
-   *
-   *  - Otherwise schedule background shading kernel, so that we have a background to alpha-over
-   *    on. The background kernel will then schedule surface shading for the matte ray.
+  /* Test if we hit a shadow catcher object, and potentially split the path to continue tracing two
+   * paths from here. */
+  const int object_flags = intersection_get_object_flags(kg, isect);
+  if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, object_flags)) {
+    return;
+  }
+
+  /* Mark state as having done a shadow catcher split so that it stops contributing to
+   * the shadow catcher matte pass, but keeps contributing to the combined pass. */
+  INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
+
+  /* Copy current state to new state. */
+  state = integrator_state_shadow_catcher_split(kg, state);
+
+  /* Initialize new state.
    *
    * Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
    * the matte path. */
 
-  const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
+  /* Mark current state so that it will only track contribution of shadow catcher objects ignoring
+   * non-catcher objects. */
+  INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS;
+
+  if (kernel_data.film.pass_background != PASS_UNUSED && !kernel_data.background.transparent) {
+    /* If using background pass, schedule background shading kernel so that we have a background
+     * to alpha-over on. The background kernel will then continue the path afterwards. */
+    INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
+    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    return;
+  }
+
+  if (!integrator_state_volume_stack_is_empty(kg, state)) {
+    /* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
+     * objects from it, and then continue shading volume and shadow catcher surface after. */
+    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+    return;
+  }
+
+  /* Continue with shading shadow catcher surface. */
+  const int shader = intersection_get_shader(kg, isect);
+  const int flags = kernel_tex_fetch(__shaders, shader).flags;
+  const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
+
+  if (use_raytrace_kernel) {
+    INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+  }
+  else {
+    INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+  }
+}
+
+/* Schedule next kernel to be executed after updating volume stack for shadow catcher. */
+template<uint32_t current_kernel>
+ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_volume(
+    KernelGlobals kg, IntegratorState state)
+{
+  /* Continue with shading shadow catcher surface. Same as integrator_split_shadow_catcher, but
+   * using NEXT instead of INIT. */
+  Intersection isect ccl_optional_struct_init;
+  integrator_state_read_isect(kg, state, &isect);
+
+  const int shader = intersection_get_shader(kg, &isect);
+  const int flags = kernel_tex_fetch(__shaders, shader).flags;
+  const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
 
   if (use_raytrace_kernel) {
     INTEGRATOR_PATH_NEXT_SORTED(
@@ -124,23 +162,132 @@ ccl_device_forceinline void integrator_intersect_shader_next_kernel(
   else {
     INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
   }
+}
 
-#ifdef __SHADOW_CATCHER__
-  const int object_flags = intersection_get_object_flags(kg, isect);
-  if (kernel_shadow_catcher_split(kg, state, object_flags)) {
-    if (kernel_data.film.pass_background != PASS_UNUSED && !kernel_data.background.transparent) {
-      INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
+/* Schedule next kernel to be executed after executing background shader for shadow catcher. */
+template<uint32_t current_kernel>
+ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_background(
+    KernelGlobals kg, IntegratorState state)
+{
+  /* Same logic as integrator_split_shadow_catcher, but using NEXT instead of INIT. */
+  if (!integrator_state_volume_stack_is_empty(kg, state)) {
+    /* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
+     * objects from it, and then continue shading volume and shadow catcher surface after. */
+    INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+    return;
+  }
 
-      INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
-    }
-    else if (use_raytrace_kernel) {
-      INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+  /* Continue with shading shadow catcher surface. */
+  integrator_intersect_next_kernel_after_shadow_catcher_volume<current_kernel>(kg, state);
+}
+#endif
+
+/* Schedule next kernel to be executed after intersect closest.
+ *
+ * Note that current_kernel is a template value since making this a variable
+ * leads to poor performance with CUDA atomics. */
+template<uint32_t current_kernel>
+ccl_device_forceinline void integrator_intersect_next_kernel(
+    KernelGlobals kg,
+    IntegratorState state,
+    ccl_private const Intersection *ccl_restrict isect,
+    const bool hit)
+{
+  /* Continue with volume kernel if we are inside a volume, regardless if we hit anything. */
+#ifdef __VOLUME__
+  if (!integrator_state_volume_stack_is_empty(kg, state)) {
+    const bool hit_surface = hit && !(isect->type & PRIMITIVE_LAMP);
+    const int shader = (hit_surface) ? intersection_get_shader(kg, isect) : SHADER_NONE;
+    const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
+
+    if (!integrator_intersect_terminate(kg, state, flags)) {
+      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
     }
     else {
-      INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+      INTEGRATOR_PATH_TERMINATE(current_kernel);
     }
+    return;
   }
 #endif
+
+  if (hit) {
+    /* Hit a surface, continue with light or surface kernel. */
+    if (isect->type & PRIMITIVE_LAMP) {
+      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+    }
+    else {
+      /* Hit a surface, continue with surface kernel unless terminated. */
+      const int shader = intersection_get_shader(kg, isect);
+      const int flags = kernel_tex_fetch(__shaders, shader).flags;
+
+      if (!integrator_intersect_terminate(kg, state, flags)) {
+        const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
+        if (use_raytrace_kernel) {
+          INTEGRATOR_PATH_NEXT_SORTED(
+              current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+        }
+        else {
+          INTEGRATOR_PATH_NEXT_SORTED(
+              current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+        }
+
+#ifdef __SHADOW_CATCHER__
+        /* Handle shadow catcher. */
+        integrator_split_shadow_catcher(kg, state, isect);
+#endif
+      }
+      else {
+        INTEGRATOR_PATH_TERMINATE(current_kernel);
+      }
+    }
+  }
+  else {
+    /* Nothing hit, continue with background kernel. */
+    INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+  }
+}
+
+/* Schedule next kernel to be executed after shade volume.
+ *
+ * The logic here matches integrator_intersect_next_kernel, except that
+ * volume shading and termination testing have already been done. */
+template<uint32_t current_kernel>
+ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
+    KernelGlobals kg, IntegratorState state, ccl_private const Intersection *ccl_restrict isect)
+{
+  if (isect->prim != PRIM_NONE) {
+    /* Hit a surface, continue with light or surface kernel. */
+    if (isect->type & PRIMITIVE_LAMP) {
+      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+      return;
+    }
+    else {
+      /* Hit a surface, continue with surface kernel unless terminated. */
+      const int shader = intersection_get_shader(kg, isect);
+      const int flags = kernel_tex_fetch(__shaders, shader).flags;
+      const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
+
+      if (use_raytrace_kernel) {
+        INTEGRATOR_PATH_NEXT_SORTED(
+            current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+      }
+      else {
+        INTEGRATOR_PATH_NEXT_SORTED(
+            current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+      }
+
+#ifdef __SHADOW_CATCHER__
+      /* Handle shadow catcher. */
+      integrator_split_shadow_catcher(kg, state, isect);
+#endif
+      return;
+    }
+  }
+  else {
+    /* Nothing hit, continue with background kernel. */
+    INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    return;
+  }
 }
 
 ccl_device void integrator_intersect_closest(KernelGlobals kg, IntegratorState state)
@@ -192,56 +339,9 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg, IntegratorState s
   /* Write intersection result into global integrator state memory. */
   integrator_state_write_isect(kg, state, &isect);
 
-#ifdef __VOLUME__
-  if (!integrator_state_volume_stack_is_empty(kg, state)) {
-    const bool hit_surface = hit && !(isect.type & PRIMITIVE_LAMP);
-    const int shader = (hit_surface) ? intersection_get_shader(kg, &isect) : SHADER_NONE;
-    const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
-
-    if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
-            kg, state, flags)) {
-      /* Continue with volume kernel if we are inside a volume, regardless
-       * if we hit anything. */
-      INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
-                           DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
-    }
-    else {
-      INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
-    }
-    return;
-  }
-#endif
-
-  if (hit) {
-    /* Hit a surface, continue with light or surface kernel. */
-    if (isect.type & PRIMITIVE_LAMP) {
-      INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
-                           DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
-      return;
-    }
-    else {
-      /* Hit a surface, continue with surface kernel unless terminated. */
-      const int shader = intersection_get_shader(kg, &isect);
-      const int flags = kernel_tex_fetch(__shaders, shader).flags;
-
-      if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
-              kg, state, flags)) {
-        integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
-            kg, state, &isect, shader, flags);
-        return;
-      }
-      else {
-        INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
-        return;
-      }
-    }
-  }
-  else {
-    /* Nothing hit, continue with background kernel. */
-    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
-                         DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
-    return;
-  }
+  /* Setup up next kernel to be executed. */
+  integrator_intersect_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+      kg, state, &isect, hit);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/intersect_volume_stack.h b/intern/cycles/kernel/integrator/intersect_volume_stack.h
index dd0587db9d8..9fa5ff63ad2 100644
--- a/intern/cycles/kernel/integrator/intersect_volume_stack.h
+++ b/intern/cycles/kernel/integrator/intersect_volume_stack.h
@@ -42,10 +42,13 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
   /* Store to avoid global fetches on every intersection step. */
   const uint volume_stack_size = kernel_data.volume_stack_size;
 
+  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
+  const uint32_t visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, PATH_RAY_ALL_VISIBILITY);
+
 #ifdef __VOLUME_RECORD_ALL__
   Intersection hits[2 * MAX_VOLUME_STACK_SIZE + 1];
   uint num_hits = scene_intersect_volume_all(
-      kg, &volume_ray, hits, 2 * volume_stack_size, PATH_RAY_ALL_VISIBILITY);
+      kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
   if (num_hits > 0) {
     Intersection *isect = hits;
 
@@ -60,7 +63,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
   Intersection isect;
   int step = 0;
   while (step < 2 * volume_stack_size &&
-         scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
+         scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
     shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
     volume_stack_enter_exit(kg, state, stack_sd);
 
@@ -74,7 +77,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
 #endif
 }
 
-ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorState state)
+ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState state)
 {
   PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK);
 
@@ -89,14 +92,20 @@ ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorSt
   volume_ray.D = make_float3(0.0f, 0.0f, 1.0f);
   volume_ray.t = FLT_MAX;
 
-  const uint visibility = (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_ALL_VISIBILITY);
   int stack_index = 0, enclosed_index = 0;
 
-  /* Write background shader. */
+  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
+  const uint32_t visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, PATH_RAY_CAMERA);
+
+  /* Initialize volume stack with background volume For shadow catcher the
+   * background volume is always assumed to be CG. */
   if (kernel_data.background.volume_shader != SHADER_NONE) {
-    const VolumeStack new_entry = {OBJECT_NONE, kernel_data.background.volume_shader};
-    integrator_state_write_volume_stack(state, stack_index, new_entry);
-    stack_index++;
+    if (!(path_flag & PATH_RAY_SHADOW_CATCHER_PASS)) {
+      INTEGRATOR_STATE_ARRAY_WRITE(state, volume_stack, stack_index, object) = OBJECT_NONE;
+      INTEGRATOR_STATE_ARRAY_WRITE(
+          state, volume_stack, stack_index, shader) = kernel_data.background.volume_shader;
+      stack_index++;
+    }
   }
 
   /* Store to avoid global fetches on every intersection step. */
@@ -202,9 +211,22 @@ ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorSt
   /* Write terminator. */
   const VolumeStack new_entry = {OBJECT_NONE, SHADER_NONE};
   integrator_state_write_volume_stack(state, stack_index, new_entry);
+}
 
-  INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
-                       DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorState state)
+{
+  integrator_volume_stack_init(kg, state);
+
+  if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SHADOW_CATCHER_PASS) {
+    /* Volume stack re-init for shadow catcher, continue with shading of hit. */
+    integrator_intersect_next_kernel_after_shadow_catcher_volume<
+        DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK>(kg, state);
+  }
+  else {
+    /* Volume stack init for camera rays, continue with intersection of camera ray. */
+    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
+                         DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+  }
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/shade_background.h b/intern/cycles/kernel/integrator/shade_background.h
index 71a590749bd..24482e85b05 100644
--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -192,23 +192,11 @@ ccl_device void integrator_shade_background(KernelGlobals kg,
 
 #ifdef __SHADOW_CATCHER__
   if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
+    /* Special case for shadow catcher where we want to fill the background pass
+     * behind the shadow catcher but also continue tracing the path. */
     INTEGRATOR_STATE_WRITE(state, path, flag) &= ~PATH_RAY_SHADOW_CATCHER_BACKGROUND;
-
-    const int isect_prim = INTEGRATOR_STATE(state, isect, prim);
-    const int isect_type = INTEGRATOR_STATE(state, isect, type);
-    const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim, isect_type);
-    const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
-
-    if (shader_flags & SD_HAS_RAYTRACE) {
-      INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
-                                  DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
-                                  shader);
-    }
-    else {
-      INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
-                                  DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
-                                  shader);
-    }
+    integrator_intersect_next_kernel_after_shadow_catcher_background<
+        DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND>(kg, state);
     return;
   }
 #endif
diff --git a/intern/cycles/kernel/integrator/shade_volume.h b/intern/cycles/kernel/integrator/shade_volume.h
index 05959bef220..412be289ebe 100644
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -1023,25 +1023,9 @@ ccl_device void integrator_shade_volume(KernelGlobals kg,
   }
   else {
     /* Continue to background, light or surface. */
-    if (isect.prim == PRIM_NONE) {
-      INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
-                           DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
-      return;
-    }
-    else if (isect.type & PRIMITIVE_LAMP) {
-      INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
-                           DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
-      return;
-    }
-    else {
-      /* Hit a surface, continue with surface kernel unless terminated. */
-      const int shader = intersection_get_shader(kg, &isect);
-      const int flags = kernel_tex_fetch(__shaders, shader).flags;
-
-      integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
-          kg, state, &isect, shader, flags);
-      return;
-    }
+    integrator_intersect_next_kernel_after_volume<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
+        kg, state, &isect);
+    return;
   }
 #endif /* __VOLUME__ */
 }
diff --git a/intern/cycles/kernel/integrator/shadow_catcher.h b/intern/cycles/kernel/integrator/shadow_catcher.h
index 7beae235dbc..ac55678c9cb 100644
--- a/intern/cycles/kernel/integrator/shadow_catcher.h
+++ b/intern/cycles/kernel/integrator/shadow_catcher.h
@@ -76,33 +76,6 @@ ccl_device_inline bool kernel_shadow_catcher_path_can_split(KernelGlobals kg,
   return (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) != 0;
 }
 
-/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
- * after this function. */
-ccl_device_inline bool kernel_shadow_catcher_split(KernelGlobals kg,
-                                                   IntegratorState state,
-                                                   const int object_flags)
-{
-#ifdef __SHADOW_CATCHER__
-
-  if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, object_flags)) {
-    return false;
-  }
-
-  /* The split is to be done. Mark the current state as such, so that it stops contributing to the
-   * shadow catcher matte pass, but keeps contributing to the combined pass. */
-  INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
-
-  /* Split new state from the current one. This new state will only track contribution of shadow
-   * catcher objects ignoring non-catcher objects. */
-  integrator_state_shadow_catcher_split(kg, state);
-
-  return true;
-#else
-  (void)object_flags;
-  return false;
-#endif
-}
-
 #ifdef __SHADOW_CATCHER__
 
 ccl_device_forceinline bool kernel_shadow_catcher_is_matte_path(const uint32_t path_flag)
diff --git a/intern/cycles/kernel/integrator/state.h b/intern/cycles/kernel/integrator/state.h
index 86dac0a65cf..ed2a0be3068 100644
--- a/intern/cycles/kernel/integrator/state.h
+++ b/intern/cycles/kernel/integrator/state.h
@@ -173,10 +173,10 @@ typedef const IntegratorShadowStateCPU *ccl_restrict ConstIntegratorShadowState;
 
 /* Array access on GPU with Structure-of-Arrays. */
 
-typedef const int IntegratorState;
-typedef const int ConstIntegratorState;
-typedef const int IntegratorShadowState;
-typedef const int ConstIntegratorShadowState;
+typedef int IntegratorState;
+typedef int ConstIntegratorState;
+typedef int IntegratorShadowState;
+typedef int ConstIntegratorShadowState;
 
 #  define INTEGRATOR_STATE_NULL -1
 
diff --git a/intern/cycles/kernel/integrator/state_util.h b/intern/cycles/kernel/integrator/state_util.h
index dafe06e7009..99dae83233c 100644
--- a/intern/cycles/kernel/integrator/state_util.h
+++ b/intern/cycles/kernel/integrator/state_util.h
@@ -326,8 +326,8 @@ ccl_device_inline void integrator_shadow_state_move(KernelGlobals kg,
 
 /* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
  * after this function. */
-ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg,
-                                                             IntegratorState state)
+ccl_device_inline IntegratorState integrator_state_shadow_catcher_split(KernelGlobals kg,
+                                                                        IntegratorState state)
 {
 #if defined(__KERNEL_GPU__)
   ConstIntegratorState to_state = atomic_fetch_and_add_uint32(
@@ -337,14 +337,14 @@ ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg,
 #else
   IntegratorStateCPU *ccl_restrict to_state = state + 1;
 
-  /* Only copy the required subset, since shadow intersections are big and irrelevant here. */
+  /* Only copy the required subset for performance. */
   to_state->path = state->path;
   to_state->ray = state->ray;
   to_state->isect = state->isect;
   integrator_state_copy_volume_stack(kg, to_state, state);
 #endif
 
-  INTEGRATOR_STATE_WRITE(to_state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS;
+  return to_state;
 }
 
 #ifdef __KERNEL_CPU__

From 97ff37bf54474efbce39653a1387ad55091d4964 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Fri, 5 Nov 2021 21:01:23 +0100
Subject: [PATCH 2/3] Cycles: perform CPU film reading in the kernel, to use
 AVX2 half conversion

Adds a bunch of CPU kernel functions to process one row of pixels, and uses those
instead of calling unoptimized implementations.

Fixes T92598
---
 intern/cycles/device/cpu/device_impl.cpp      |   8 +-
 intern/cycles/device/cpu/device_impl.h        |   3 -
 intern/cycles/device/cpu/kernel.cpp           |  19 +++-
 intern/cycles/device/cpu/kernel.h             |  37 ++++++
 intern/cycles/device/device.cpp               |   8 +-
 intern/cycles/device/device.h                 |   2 +-
 .../cycles/integrator/pass_accessor_cpu.cpp   | 106 +++++-------------
 intern/cycles/integrator/pass_accessor_cpu.h  |  32 +++---
 .../cycles/integrator/path_trace_work_cpu.cpp |   2 +-
 intern/cycles/integrator/shader_eval.cpp      |   2 +-
 intern/cycles/kernel/device/cpu/kernel.h      |   1 +
 intern/cycles/kernel/device/cpu/kernel_arch.h |  31 +++++
 .../kernel/device/cpu/kernel_arch_impl.h      |  81 ++++++++++++-
 13 files changed, 220 insertions(+), 112 deletions(-)

diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
index d494b40f71d..68dec7f0af2 100644
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -68,7 +68,8 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_
 {
   /* Pick any kernel, all of them are supposed to have same level of microarchitecture
    * optimization. */
-  VLOG(1) << "Using " << kernels.integrator_init_from_camera.get_uarch_name() << " CPU kernels.";
+  VLOG(1) << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name()
+          << " CPU kernels.";
 
   if (info.cpu_threads == 0) {
     info.cpu_threads = TaskScheduler::num_threads();
@@ -296,11 +297,6 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
     Device::build_bvh(bvh, progress, refit);
 }
 
-const CPUKernels *CPUDevice::get_cpu_kernels() const
-{
-  return &kernels;
-}
-
 void CPUDevice::get_cpu_kernel_thread_globals(
     vector<CPUKernelThreadGlobals> &kernel_thread_globals)
 {
diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h
index 553728ccc3b..90d217bb624 100644
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -57,8 +57,6 @@ class CPUDevice : public Device {
   RTCDevice embree_device;
 #endif
 
-  CPUKernels kernels;
-
   CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
   ~CPUDevice();
 
@@ -90,7 +88,6 @@ class CPUDevice : public Device {
 
   void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
 
-  virtual const CPUKernels *get_cpu_kernels() const override;
   virtual void get_cpu_kernel_thread_globals(
       vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
   virtual void *get_cpu_osl_memory() override;
diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp
index 3b253c094fd..91c472d41e8 100644
--- a/intern/cycles/device/cpu/kernel.cpp
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -26,6 +26,9 @@ CCL_NAMESPACE_BEGIN
       KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
 
 #define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
+#define REGISTER_KERNEL_FILM_CONVERT(name) \
+  film_convert_##name(KERNEL_FUNCTIONS(film_convert_##name)), \
+      film_convert_half_rgba_##name(KERNEL_FUNCTIONS(film_convert_half_rgba_##name))
 
 CPUKernels::CPUKernels()
     : /* Integrator. */
@@ -50,11 +53,25 @@ CPUKernels::CPUKernels()
       REGISTER_KERNEL(adaptive_sampling_filter_x),
       REGISTER_KERNEL(adaptive_sampling_filter_y),
       /* Cryptomatte. */
-      REGISTER_KERNEL(cryptomatte_postprocess)
+      REGISTER_KERNEL(cryptomatte_postprocess),
+      /* Film Convert. */
+      REGISTER_KERNEL_FILM_CONVERT(depth),
+      REGISTER_KERNEL_FILM_CONVERT(mist),
+      REGISTER_KERNEL_FILM_CONVERT(sample_count),
+      REGISTER_KERNEL_FILM_CONVERT(float),
+      REGISTER_KERNEL_FILM_CONVERT(light_path),
+      REGISTER_KERNEL_FILM_CONVERT(float3),
+      REGISTER_KERNEL_FILM_CONVERT(motion),
+      REGISTER_KERNEL_FILM_CONVERT(cryptomatte),
+      REGISTER_KERNEL_FILM_CONVERT(shadow_catcher),
+      REGISTER_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow),
+      REGISTER_KERNEL_FILM_CONVERT(combined),
+      REGISTER_KERNEL_FILM_CONVERT(float4)
 {
 }
 
 #undef REGISTER_KERNEL
+#undef REGISTER_KERNEL_FILM_CONVERT
 #undef KERNEL_FUNCTIONS
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h
index 5beeaf148a1..406bd07ab3d 100644
--- a/intern/cycles/device/cpu/kernel.h
+++ b/intern/cycles/device/cpu/kernel.h
@@ -17,11 +17,13 @@
 #pragma once
 
 #include "device/cpu/kernel_function.h"
+#include "util/half.h"
 #include "util/types.h"
 
 CCL_NAMESPACE_BEGIN
 
 struct KernelGlobalsCPU;
+struct KernelFilmConvert;
 struct IntegratorStateCPU;
 struct TileInfo;
 
@@ -102,6 +104,41 @@ class CPUKernels {
 
   CryptomattePostprocessFunction cryptomatte_postprocess;
 
+  /* Film Convert. */
+  using FilmConvertFunction = CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
+                                                         const float *buffer,
+                                                         float *pixel,
+                                                         const int width,
+                                                         const int buffer_stride,
+                                                         const int pixel_stride)>;
+  using FilmConvertHalfRGBAFunction =
+      CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
+                                 const float *buffer,
+                                 half4 *pixel,
+                                 const int width,
+                                 const int buffer_stride)>;
+
+#define KERNEL_FILM_CONVERT_FUNCTION(name) \
+  FilmConvertFunction film_convert_##name; \
+  FilmConvertHalfRGBAFunction film_convert_half_rgba_##name;
+
+  KERNEL_FILM_CONVERT_FUNCTION(depth)
+  KERNEL_FILM_CONVERT_FUNCTION(mist)
+  KERNEL_FILM_CONVERT_FUNCTION(sample_count)
+  KERNEL_FILM_CONVERT_FUNCTION(float)
+
+  KERNEL_FILM_CONVERT_FUNCTION(light_path)
+  KERNEL_FILM_CONVERT_FUNCTION(float3)
+
+  KERNEL_FILM_CONVERT_FUNCTION(motion)
+  KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
+  KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
+  KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
+  KERNEL_FILM_CONVERT_FUNCTION(combined)
+  KERNEL_FILM_CONVERT_FUNCTION(float4)
+
+#undef KERNEL_FILM_CONVERT_FUNCTION
+
   CPUKernels();
 };
 
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 69e959b6f7b..63d0a49d3eb 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -23,6 +23,7 @@
 #include "device/queue.h"
 
 #include "device/cpu/device.h"
+#include "device/cpu/kernel.h"
 #include "device/cuda/device.h"
 #include "device/dummy/device.h"
 #include "device/hip/device.h"
@@ -363,10 +364,11 @@ unique_ptr<DeviceQueue> Device::gpu_queue_create()
   return nullptr;
 }
 
-const CPUKernels *Device::get_cpu_kernels() const
+const CPUKernels &Device::get_cpu_kernels()
 {
-  LOG(FATAL) << "Device does not support CPU kernels.";
-  return nullptr;
+  /* Initialize CPU kernels once and reuse. */
+  static CPUKernels kernels;
+  return kernels;
 }
 
 void Device::get_cpu_kernel_thread_globals(
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 3cb177adde7..65188459c2c 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -180,7 +180,7 @@ class Device {
    * These may not be used on GPU or multi-devices. */
 
   /* Get CPU kernel functions for native instruction set. */
-  virtual const CPUKernels *get_cpu_kernels() const;
+  static const CPUKernels &get_cpu_kernels();
   /* Get kernel globals to pass to kernels. */
   virtual void get_cpu_kernel_thread_globals(
       vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
index 820da757be0..77ca332d142 100644
--- a/intern/cycles/integrator/pass_accessor_cpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -14,9 +14,12 @@
  * limitations under the License.
  */
 
+#include "device/device.h"
+
 #include "integrator/pass_accessor_cpu.h"
 
 #include "session/buffers.h"
+
 #include "util/log.h"
 #include "util/tbb.h"
 
@@ -33,70 +36,16 @@ CCL_NAMESPACE_BEGIN
  * Kernel processing.
  */
 
-template<typename Processor>
-inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
-                                                           const BufferParams &buffer_params,
-                                                           const Destination &destination,
-                                                           const Processor &processor) const
-{
-  KernelFilmConvert kfilm_convert;
-  init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
-
-  if (destination.pixels) {
-    /* NOTE: No overlays are applied since they are not used for final renders.
-     * Can be supported via some sort of specialization to avoid code duplication. */
-
-    run_get_pass_kernel_processor_float(
-        &kfilm_convert, render_buffers, buffer_params, destination, processor);
-  }
-
-  if (destination.pixels_half_rgba) {
-    /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */
-
-    if (destination.num_components == 1) {
-      run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
-                                              render_buffers,
-                                              buffer_params,
-                                              destination,
-                                              [&processor](const KernelFilmConvert *kfilm_convert,
-                                                           ccl_global const float *buffer,
-                                                           float *pixel_rgba) {
-                                                float pixel;
-                                                processor(kfilm_convert, buffer, &pixel);
-
-                                                pixel_rgba[0] = pixel;
-                                                pixel_rgba[1] = pixel;
-                                                pixel_rgba[2] = pixel;
-                                                pixel_rgba[3] = 1.0f;
-                                              });
-    }
-    else if (destination.num_components == 3) {
-      run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
-                                              render_buffers,
-                                              buffer_params,
-                                              destination,
-                                              [&processor](const KernelFilmConvert *kfilm_convert,
-                                                           ccl_global const float *buffer,
-                                                           float *pixel_rgba) {
-                                                processor(kfilm_convert, buffer, pixel_rgba);
-                                                pixel_rgba[3] = 1.0f;
-                                              });
-    }
-    else if (destination.num_components == 4) {
-      run_get_pass_kernel_processor_half_rgba(
-          &kfilm_convert, render_buffers, buffer_params, destination, processor);
-    }
-  }
-}
-
-template<typename Processor>
 inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
     const KernelFilmConvert *kfilm_convert,
     const RenderBuffers *render_buffers,
     const BufferParams &buffer_params,
     const Destination &destination,
-    const Processor &processor) const
+    const CPUKernels::FilmConvertFunction func) const
 {
+  /* NOTE: No overlays are applied since they are not used for final renders.
+   * Can be supported via some sort of specialization to avoid code duplication. */
+
   DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
 
   const int64_t pass_stride = buffer_params.pass_stride;
@@ -112,21 +61,16 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
     const float *buffer = window_data + y * buffer_row_stride;
     float *pixel = destination.pixels +
                    (y * buffer_params.width + destination.offset) * pixel_stride;
-
-    for (int64_t x = 0; x < buffer_params.window_width;
-         ++x, buffer += pass_stride, pixel += pixel_stride) {
-      processor(kfilm_convert, buffer, pixel);
-    }
+    func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride, pixel_stride);
   });
 }
 
-template<typename Processor>
 inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
     const KernelFilmConvert *kfilm_convert,
     const RenderBuffers *render_buffers,
     const BufferParams &buffer_params,
     const Destination &destination,
-    const Processor &processor) const
+    const CPUKernels::FilmConvertHalfRGBAFunction func) const
 {
   const int64_t pass_stride = buffer_params.pass_stride;
   const int64_t buffer_row_stride = buffer_params.stride * buffer_params.pass_stride;
@@ -141,16 +85,7 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
   tbb::parallel_for(0, buffer_params.window_height, [&](int64_t y) {
     const float *buffer = window_data + y * buffer_row_stride;
     half4 *pixel = dst_start + y * destination_stride;
-    for (int64_t x = 0; x < buffer_params.window_width; ++x, buffer += pass_stride, ++pixel) {
-
-      float pixel_rgba[4];
-      processor(kfilm_convert, buffer, pixel_rgba);
-
-      film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba);
-
-      *pixel = float4_to_half4_display(
-          make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
-    }
+    func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride);
   });
 }
 
@@ -163,8 +98,25 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
                                         const BufferParams &buffer_params, \
                                         const Destination &destination) const \
   { \
-    run_get_pass_kernel_processor( \
-        render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \
+    const CPUKernels &kernels = Device::get_cpu_kernels(); \
+    KernelFilmConvert kfilm_convert; \
+    init_kernel_film_convert(&kfilm_convert, buffer_params, destination); \
+\
+    if (destination.pixels) { \
+      run_get_pass_kernel_processor_float(&kfilm_convert, \
+                                          render_buffers, \
+                                          buffer_params, \
+                                          destination, \
+                                          kernels.film_convert_##pass); \
+    } \
+\
+    if (destination.pixels_half_rgba) { \
+      run_get_pass_kernel_processor_half_rgba(&kfilm_convert, \
+                                              render_buffers, \
+                                              buffer_params, \
+                                              destination, \
+                                              kernels.film_convert_half_rgba_##pass); \
+    } \
   }
 
 /* Float (scalar) passes. */
diff --git a/intern/cycles/integrator/pass_accessor_cpu.h b/intern/cycles/integrator/pass_accessor_cpu.h
index 0313dc5bb0d..9ed38ab256e 100644
--- a/intern/cycles/integrator/pass_accessor_cpu.h
+++ b/intern/cycles/integrator/pass_accessor_cpu.h
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include "device/cpu/kernel.h"
+
 #include "integrator/pass_accessor.h"
 
 CCL_NAMESPACE_BEGIN
@@ -28,25 +30,19 @@ class PassAccessorCPU : public PassAccessor {
   using PassAccessor::PassAccessor;
 
  protected:
-  template<typename Processor>
-  inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
-                                            const BufferParams &buffer_params,
-                                            const Destination &destination,
-                                            const Processor &processor) const;
+  inline void run_get_pass_kernel_processor_float(
+      const KernelFilmConvert *kfilm_convert,
+      const RenderBuffers *render_buffers,
+      const BufferParams &buffer_params,
+      const Destination &destination,
+      const CPUKernels::FilmConvertFunction func) const;
 
-  template<typename Processor>
-  inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert,
-                                                  const RenderBuffers *render_buffers,
-                                                  const BufferParams &buffer_params,
-                                                  const Destination &destination,
-                                                  const Processor &processor) const;
-
-  template<typename Processor>
-  inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert,
-                                                      const RenderBuffers *render_buffers,
-                                                      const BufferParams &buffer_params,
-                                                      const Destination &destination,
-                                                      const Processor &processor) const;
+  inline void run_get_pass_kernel_processor_half_rgba(
+      const KernelFilmConvert *kfilm_convert,
+      const RenderBuffers *render_buffers,
+      const BufferParams &buffer_params,
+      const Destination &destination,
+      const CPUKernels::FilmConvertHalfRGBAFunction func) const;
 
 #define DECLARE_PASS_ACCESSOR(pass) \
   virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index 541a7eca02f..36ce2be9f6d 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -58,7 +58,7 @@ PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
                                    DeviceScene *device_scene,
                                    bool *cancel_requested_flag)
     : PathTraceWork(device, film, device_scene, cancel_requested_flag),
-      kernels_(*(device->get_cpu_kernels()))
+      kernels_(Device::get_cpu_kernels())
 {
   DCHECK_EQ(device->info.type, DEVICE_CPU);
 }
diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp
index 42cbf87f254..9ec530c81df 100644
--- a/intern/cycles/integrator/shader_eval.cpp
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -96,7 +96,7 @@ bool ShaderEval::eval_cpu(Device *device,
   device->get_cpu_kernel_thread_globals(kernel_thread_globals);
 
   /* Find required kernel function. */
-  const CPUKernels &kernels = *(device->get_cpu_kernels());
+  const CPUKernels &kernels = Device::get_cpu_kernels();
 
   /* Simple parallel_for over all work items. */
   KernelShaderEvalInput *input_data = input.data();
diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h
index c49d7ca445a..6af8094b1ea 100644
--- a/intern/cycles/kernel/device/cpu/kernel.h
+++ b/intern/cycles/kernel/device/cpu/kernel.h
@@ -18,6 +18,7 @@
 
 /* CPU Kernel Interface */
 
+#include "util/half.h"
 #include "util/types.h"
 
 #include "kernel/types.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
index 432ac5e15a9..2f9a3f7c59d 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -52,6 +52,37 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
 #undef KERNEL_INTEGRATOR_INIT_FUNCTION
 #undef KERNEL_INTEGRATOR_SHADE_FUNCTION
 
+#define KERNEL_FILM_CONVERT_FUNCTION(name) \
+  void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
+                                                      const float *buffer, \
+                                                      float *pixel, \
+                                                      const int width, \
+                                                      const int buffer_stride, \
+                                                      const int pixel_stride); \
+  void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
+      const KernelFilmConvert *kfilm_convert, \
+      const float *buffer, \
+      half4 *pixel, \
+      const int width, \
+      const int buffer_stride);
+
+KERNEL_FILM_CONVERT_FUNCTION(depth)
+KERNEL_FILM_CONVERT_FUNCTION(mist)
+KERNEL_FILM_CONVERT_FUNCTION(sample_count)
+KERNEL_FILM_CONVERT_FUNCTION(float)
+
+KERNEL_FILM_CONVERT_FUNCTION(light_path)
+KERNEL_FILM_CONVERT_FUNCTION(float3)
+
+KERNEL_FILM_CONVERT_FUNCTION(motion)
+KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
+KERNEL_FILM_CONVERT_FUNCTION(combined)
+KERNEL_FILM_CONVERT_FUNCTION(float4)
+
+#undef KERNEL_FILM_CONVERT_FUNCTION
+
 /* --------------------------------------------------------------------
  * Shader evaluation.
  */
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
index 6df5d7787fc..1ea5002e300 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -47,8 +47,8 @@
 #    include "kernel/integrator/megakernel.h"
 
 #    include "kernel/film/adaptive_sampling.h"
-#    include "kernel/film/read.h"
 #    include "kernel/film/id_passes.h"
+#    include "kernel/film/read.h"
 
 #    include "kernel/bake/bake.h"
 
@@ -232,6 +232,85 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *
 #endif
 }
 
+/* --------------------------------------------------------------------
+ * Film Convert.
+ */
+
+#ifdef KERNEL_STUB
+
+#  define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
+    void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
+                                                        const float *buffer, \
+                                                        float *pixel, \
+                                                        const int width, \
+                                                        const int buffer_stride, \
+                                                        const int pixel_stride) \
+    { \
+      STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
+    } \
+    void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
+        const KernelFilmConvert *kfilm_convert, \
+        const float *buffer, \
+        half4 *pixel, \
+        const int width, \
+        const int buffer_stride) \
+    { \
+      STUB_ASSERT(KERNEL_ARCH, film_convert_half_rgba_##name); \
+    }
+
+#else
+
+#  define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
+    void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
+                                                        const float *buffer, \
+                                                        float *pixel, \
+                                                        const int width, \
+                                                        const int buffer_stride, \
+                                                        const int pixel_stride) \
+    { /* Convert `width` consecutive pixels, stepping both pointers by their strides. */ \
+      for (int i = 0; i < width; i++, buffer += buffer_stride, pixel += pixel_stride) { \
+        film_get_pass_pixel_##name(kfilm_convert, buffer, pixel); \
+      } \
+    } \
+    void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
+        const KernelFilmConvert *kfilm_convert, \
+        const float *buffer, \
+        half4 *pixel, \
+        const int width, \
+        const int buffer_stride) \
+    { /* Same traversal, but every pixel is written out as one half4. */ \
+      for (int i = 0; i < width; i++, buffer += buffer_stride, pixel++) { \
+        float pixel_rgba[4] = {0.0f, 0.0f, 0.0f, 1.0f}; /* Defaults for unwritten channels. */ \
+        film_get_pass_pixel_##name(kfilm_convert, buffer, pixel_rgba); \
+        if (is_float) { /* Single-value pass: copy channel 0 into channels 1 and 2. */ \
+          pixel_rgba[1] = pixel_rgba[0]; \
+          pixel_rgba[2] = pixel_rgba[0]; \
+        } \
+        film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba); \
+        *pixel = float4_to_half4_display( \
+            make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3])); \
+      } \
+    }
+
+#endif
+
+KERNEL_FILM_CONVERT_FUNCTION(depth, true)
+KERNEL_FILM_CONVERT_FUNCTION(mist, true)
+KERNEL_FILM_CONVERT_FUNCTION(sample_count, true)
+KERNEL_FILM_CONVERT_FUNCTION(float, true)
+
+KERNEL_FILM_CONVERT_FUNCTION(light_path, false)
+KERNEL_FILM_CONVERT_FUNCTION(float3, false)
+
+KERNEL_FILM_CONVERT_FUNCTION(motion, false)
+KERNEL_FILM_CONVERT_FUNCTION(cryptomatte, false)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher, false)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow, false)
+KERNEL_FILM_CONVERT_FUNCTION(combined, false)
+KERNEL_FILM_CONVERT_FUNCTION(float4, false)
+
+#undef KERNEL_FILM_CONVERT_FUNCTION
+
 #undef KERNEL_INVOKE
 #undef DEFINE_INTEGRATOR_KERNEL
 #undef DEFINE_INTEGRATOR_SHADE_KERNEL

From 9be49a10699417aa5902144d99ff70e5e1fc6af8 Mon Sep 17 00:00:00 2001
From: Leon Leno <lone_noel>
Date: Fri, 5 Nov 2021 16:32:35 -0500
Subject: [PATCH 3/3] Fix: Property editor icon jittering in some cases

In the tools tab, the tool icon would be offset when it intersected
the bottom of the editor. With some screen resolutions, the icons on
the left side of the editor would also move when intersecting the
bottom of the editor. This happened because of the truncation in
the implicit conversion from float to int. Instead, use explicit
conversion functions.

Differential Revision: https://developer.blender.org/D11097
---
 source/blender/editors/interface/interface_icons.c   | 3 ++-
 source/blender/editors/interface/interface_widgets.c | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/source/blender/editors/interface/interface_icons.c b/source/blender/editors/interface/interface_icons.c
index 5784af90834..c1dd4fcb4e4 100644
--- a/source/blender/editors/interface/interface_icons.c
+++ b/source/blender/editors/interface/interface_icons.c
@@ -1503,7 +1503,8 @@ static void icon_draw_rect(float x,
   int draw_w = w;
   int draw_h = h;
   int draw_x = x;
-  int draw_y = y;
+  /* Round y rather than truncating it, to avoid the icon jittering in some cases. */
+  int draw_y = round_fl_to_int(y);
 
   /* sanity check */
   if (w <= 0 || h <= 0 || w > 2000 || h > 2000) {
diff --git a/source/blender/editors/interface/interface_widgets.c b/source/blender/editors/interface/interface_widgets.c
index 4b11ed61657..7d1b7b80ccd 100644
--- a/source/blender/editors/interface/interface_widgets.c
+++ b/source/blender/editors/interface/interface_widgets.c
@@ -1407,8 +1407,8 @@ static void widget_draw_icon(
 
     /* force positions to integers, for zoom levels near 1. draws icons crisp. */
     if (aspect > 0.95f && aspect < 1.05f) {
-      xs = (int)(xs + 0.1f);
-      ys = (int)(ys + 0.1f);
+      xs = roundf(xs);
+      ys = roundf(ys);
     }
 
     /* Get theme color. */