/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

/* Common GPU kernels. */

#include "kernel/device/gpu/parallel_active_index.h"
#include "kernel/device/gpu/parallel_prefix_sum.h"
#include "kernel/device/gpu/parallel_sorted_index.h"

#include "kernel/sample/lcg.h"

/* Include constant tables before entering Metal's context class scope (context_begin.h). */
#include "kernel/tables.h"

#ifdef __KERNEL_METAL__
#  include "kernel/device/metal/context_begin.h"
#elif defined(__KERNEL_ONEAPI__)
#  include "kernel/device/oneapi/context_begin.h"
#endif

#include "kernel/device/gpu/work_stealing.h"

#include "kernel/integrator/state.h"
#include "kernel/integrator/state_flow.h"
#include "kernel/integrator/state_util.h"

#include "kernel/integrator/init_from_bake.h"
#include "kernel/integrator/init_from_camera.h"
#include "kernel/integrator/intersect_closest.h"
#include "kernel/integrator/intersect_dedicated_light.h"
#include "kernel/integrator/intersect_shadow.h"
#include "kernel/integrator/intersect_subsurface.h"
#include "kernel/integrator/intersect_volume_stack.h"
#include "kernel/integrator/shade_background.h"
#include "kernel/integrator/shade_dedicated_light.h"
#include "kernel/integrator/shade_light.h"
#include "kernel/integrator/shade_shadow.h"
#include "kernel/integrator/shade_surface.h"
#include "kernel/integrator/shade_volume.h"

#include "kernel/bake/bake.h"

#include "kernel/film/adaptive_sampling.h"

#ifdef __KERNEL_METAL__
#  include "kernel/device/metal/context_end.h"
#elif defined(__KERNEL_ONEAPI__)
#  include "kernel/device/oneapi/context_end.h"
#endif

#include "kernel/film/read.h"

#if defined(__HIPRT__)
#  include "kernel/device/hiprt/hiprt_kernels.h"
#endif

/* --------------------------------------------------------------------
 * Integrator.
 */

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_reset, int num_states)
{
  const int state = ccl_gpu_global_id_x();

  if (state < num_states) {
    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_init_from_camera,
                             ccl_global KernelWorkTile *tiles,
                             const int num_tiles,
                             ccl_global float *render_buffer,
                             const int max_tile_work_size)
{
  const int work_index = ccl_gpu_global_id_x();

  if (work_index >= max_tile_work_size * num_tiles) {
    return;
  }

  const int tile_index = work_index / max_tile_work_size;
  const int tile_work_index = work_index - tile_index * max_tile_work_size;

  ccl_global const KernelWorkTile *tile = &tiles[tile_index];

  if (tile_work_index >= tile->work_size) {
    return;
  }

  const int state = tile->path_index_offset + tile_work_index;

  uint x, y, sample;
  ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));

  ccl_gpu_kernel_call(
      integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample));
}
ccl_gpu_kernel_postfix
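
/* Editor's note: a worked example of the flat work decomposition above, assuming
 * max_tile_work_size = 4096. For work_index = 9000:
 *   tile_index      = 9000 / 4096 = 2
 *   tile_work_index = 9000 - 2 * 4096 = 808
 * so the thread processes work item 808 of tile 2, and returns early if that
 * tile has fewer than 809 work items. */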

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_init_from_bake,
                             ccl_global KernelWorkTile *tiles,
                             const int num_tiles,
                             ccl_global float *render_buffer,
                             const int max_tile_work_size)
{
  const int work_index = ccl_gpu_global_id_x();

  if (work_index >= max_tile_work_size * num_tiles) {
    return;
  }

  const int tile_index = work_index / max_tile_work_size;
  const int tile_work_index = work_index - tile_index * max_tile_work_size;

  ccl_global const KernelWorkTile *tile = &tiles[tile_index];

  if (tile_work_index >= tile->work_size) {
    return;
  }

  const int state = tile->path_index_offset + tile_work_index;

  uint x, y, sample;
  ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));

  ccl_gpu_kernel_call(
      integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample));
}
ccl_gpu_kernel_postfix

#if !defined(__HIPRT__)

/* Intersection kernels need access to the kernel handler for specialization constants to work
 * properly. */
#  ifdef __KERNEL_ONEAPI__
#    include "kernel/device/oneapi/context_intersect_begin.h"
#  endif

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_intersect_closest,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix
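
/* Editor's note: the path_index_array indirection above lets one kernel serve both
 * dense and sparse dispatch. A sketch of the two host-side launch modes (the
 * launch() helper is hypothetical, not Cycles API):
 *
 *   // Dense: thread i processes state i directly.
 *   launch(integrator_intersect_closest, nullptr, render_buffer, num_states);
 *
 *   // Sparse: thread i processes the i-th state gathered by a *_paths_array kernel.
 *   launch(integrator_intersect_closest, queued_indices, render_buffer, num_queued);
 */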

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_intersect_shadow,
                             ccl_global const int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_intersect_subsurface,
                             ccl_global const int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_intersect_volume_stack,
                             ccl_global const int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_intersect_dedicated_light,
                             ccl_global const int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_intersect_dedicated_light(NULL, state));
  }
}
ccl_gpu_kernel_postfix

#  ifdef __KERNEL_ONEAPI__
#    include "kernel/device/oneapi/context_intersect_end.h"
#  endif

#endif

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_background,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_light,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_shadow,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_surface,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
#endif

#if !defined(__HIPRT__)

/* Kernels using intersections need access to the kernel handler for specialization constants to
 * work properly. */
#  ifdef __KERNEL_ONEAPI__
#    include "kernel/device/oneapi/context_intersect_begin.h"
#  endif

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;

#  if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
    KernelGlobals kg = NULL;
    /* Workaround for the Ambient Occlusion and Bevel nodes not working with Metal.
     * The dummy offset should not affect the result, but it somehow fixes the bug! */
    kg += __dummy_constant;
    ccl_gpu_kernel_call(integrator_shade_surface_raytrace(kg, state, render_buffer));
#  else
    ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer));
#  endif
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_surface_mnee,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_surface_mnee(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

#  ifdef __KERNEL_ONEAPI__
#    include "kernel/device/oneapi/context_intersect_end.h"
#  endif

#endif

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_volume,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_dedicated_light,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
    ccl_gpu_kernel_call(integrator_shade_dedicated_light(NULL, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_queued_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int kernel_index)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix
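
/* Editor's note: a minimal sketch of the predicate-based stream compaction that
 * gpu_parallel_active_index_array performs (illustrative only; the real
 * implementation in parallel_active_index.h also aggregates per block through
 * shared memory, and warp_broadcast() here is a hypothetical name): */
#if 0
const bool active = predicate(state);
const uint mask = ccl_gpu_ballot(active);
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
int warp_offset = 0;
if (lane_id == 0) {
  /* One atomic reservation of output slots per warp. */
  warp_offset = atomic_fetch_and_add_uint32(num_indices, popcount(mask));
}
warp_offset = warp_broadcast(warp_offset, 0);
if (active) {
  /* Rank of this lane among the active lanes below it. */
  const int rank = popcount(mask & ((1u << lane_id) - 1));
  indices[warp_offset + rank] = state;
}
#endif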

ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int kernel_index)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_active_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_terminated_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int indices_offset)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);

  gpu_parallel_active_index_array(
      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int indices_offset)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);

  gpu_parallel_active_index_array(
      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_sorted_paths_array,
                             int num_states,
                             int num_states_limit,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             ccl_global int *key_counter,
                             ccl_global int *key_prefix_sum,
                             int kernel_index)
{
  ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
                            INTEGRATOR_STATE(state, path, shader_sort_key) :
                            GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY,
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

  const uint state_index = ccl_gpu_global_id_x();
  gpu_parallel_sorted_index_array(state_index,
                                  num_states,
                                  num_states_limit,
                                  indices,
                                  num_indices,
                                  key_counter,
                                  key_prefix_sum,
                                  ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix
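
/* Editor's note: gpu_parallel_sorted_index_array amounts to a counting sort on
 * shader_sort_key. Worked example with three keys and per-key counts
 * key_counter = {2, 1, 3} from a previous pass: the exclusive prefix sum is
 * key_prefix_sum = {0, 2, 3}, so states with key 0 land in indices[0..1], key 1
 * in indices[2] and key 2 in indices[3..5]. Grouping states by shader this way
 * keeps threads that run the same shader adjacent, reducing divergence in the
 * shading kernels. */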

/* The oneAPI version needs the local_mem accessor in the arguments. */
#ifdef __KERNEL_ONEAPI__
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
                             int num_states,
                             int partition_size,
                             int num_states_limit,
                             ccl_global int *indices,
                             int kernel_index,
                             sycl::local_accessor<int> &local_mem)
#else
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
                             int num_states,
                             int partition_size,
                             int num_states_limit,
                             ccl_global int *indices,
                             int kernel_index)
#endif
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
  ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
                                           kernel_integrator_state.path.queued_kernel;
  ccl_global uint *d_shader_sort_key = (ccl_global uint *)
                                           kernel_integrator_state.path.shader_sort_key;
  ccl_global int *key_offsets = (ccl_global int *)
                                    kernel_integrator_state.sort_partition_key_offsets;

#  ifdef __KERNEL_METAL__
  int max_shaders = context.launch_params_metal.data.max_shaders;
#  endif

#  ifdef __KERNEL_ONEAPI__
  /* The Metal backend doesn't have these particular ccl_gpu_* defines, and the shared kernel
   * code uses the metal_* names, so define compatible values for oneAPI here. */
  int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
  int metal_local_id = ccl_gpu_thread_idx_x;
  int metal_local_size = ccl_gpu_block_dim_x;
  int metal_grid_id = ccl_gpu_block_idx_x;
  ccl_gpu_shared int *threadgroup_array = local_mem.get_pointer();
#  endif

  gpu_parallel_sort_bucket_pass(num_states,
                                partition_size,
                                max_shaders,
                                kernel_index,
                                d_queued_kernel,
                                d_shader_sort_key,
                                key_offsets,
                                (ccl_gpu_shared int *)threadgroup_array,
                                metal_local_id,
                                metal_local_size,
                                metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix

/* The oneAPI version needs the local_mem accessor in the arguments. */
#ifdef __KERNEL_ONEAPI__
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_sort_write_pass,
                             int num_states,
                             int partition_size,
                             int num_states_limit,
                             ccl_global int *indices,
                             int kernel_index,
                             sycl::local_accessor<int> &local_mem)
#else
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_sort_write_pass,
                             int num_states,
                             int partition_size,
                             int num_states_limit,
                             ccl_global int *indices,
                             int kernel_index)
#endif
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
  ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
                                           kernel_integrator_state.path.queued_kernel;
  ccl_global uint *d_shader_sort_key = (ccl_global uint *)
                                           kernel_integrator_state.path.shader_sort_key;
  ccl_global int *key_offsets = (ccl_global int *)
                                    kernel_integrator_state.sort_partition_key_offsets;

#  ifdef __KERNEL_METAL__
  int max_shaders = context.launch_params_metal.data.max_shaders;
#  endif

#  ifdef __KERNEL_ONEAPI__
  /* The Metal backend doesn't have these particular ccl_gpu_* defines, and the shared kernel
   * code uses the metal_* names, so define compatible values for oneAPI here. */
  int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
  int metal_local_id = ccl_gpu_thread_idx_x;
  int metal_local_size = ccl_gpu_block_dim_x;
  int metal_grid_id = ccl_gpu_block_idx_x;
  ccl_gpu_shared int *threadgroup_array = local_mem.get_pointer();
#  endif

  gpu_parallel_sort_write_pass(num_states,
                               partition_size,
                               max_shaders,
                               kernel_index,
                               num_states_limit,
                               indices,
                               d_queued_kernel,
                               d_shader_sort_key,
                               key_offsets,
                               (ccl_gpu_shared int *)threadgroup_array,
                               metal_local_id,
                               metal_local_size,
                               metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_compact_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int num_active_paths)
{
  ccl_gpu_kernel_lambda((state >= num_active_paths) &&
                            (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
                        int num_active_paths);
  ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix
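
/* Editor's note: a worked example of the compaction criterion above, assuming
 * num_active_paths = 100. Only states at index >= 100 that are still queued are
 * selected here; integrator_compact_states below then moves each of them into a
 * terminated slot at index < 100, leaving the first 100 entries of the state
 * array densely packed. */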

ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_compact_states,
                             ccl_global const int *active_terminated_states,
                             const int active_states_offset,
                             const int terminated_states_offset,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int from_state = active_terminated_states[active_states_offset + global_index];
    const int to_state = active_terminated_states[terminated_states_offset + global_index];

    ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int num_active_paths)
{
  ccl_gpu_kernel_lambda((state >= num_active_paths) &&
                            (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
                        int num_active_paths);
  ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;

  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_compact_shadow_states,
                             ccl_global const int *active_terminated_states,
                             const int active_states_offset,
                             const int terminated_states_offset,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    const int from_state = active_terminated_states[active_states_offset + global_index];
    const int to_state = active_terminated_states[terminated_states_offset + global_index];

    ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(
    prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values)
{
  gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values);
}
ccl_gpu_kernel_postfix
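
/* Editor's note: worked example, assuming gpu_parallel_prefix_sum computes an
 * exclusive scan: counter = {3, 1, 4} yields prefix_sum = {0, 3, 4}, i.e. the
 * starting offset of each key's bucket as consumed by
 * integrator_sorted_paths_array above. */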

/* --------------------------------------------------------------------
 * Adaptive sampling.
 */

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(adaptive_sampling_convergence_check,
                             ccl_global float *render_buffer,
                             int sx,
                             int sy,
                             int sw,
                             int sh,
                             float threshold,
                             int reset,
                             int offset,
                             int stride,
                             ccl_global uint *num_active_pixels)
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / sw;
  const int x = work_index - y * sw;

  bool converged = true;

  if (x < sw && y < sh) {
    converged = ccl_gpu_kernel_call(film_adaptive_sampling_convergence_check(
        nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
  }

  /* NOTE: All threads specified in the mask must execute the intrinsic. */
  const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
  const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
  if (lane_id == 0) {
    atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask));
  }
}
ccl_gpu_kernel_postfix
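
/* Editor's note: a worked example of the warp reduction above. With a warp size
 * of 32 and 5 non-converged pixels in the warp, ccl_gpu_ballot(!converged)
 * returns a mask with 5 bits set, so lane 0 issues a single atomic add of
 * popcount(mask) == 5 instead of 5 separate atomics on num_active_pixels. */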

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(adaptive_sampling_filter_x,
                             ccl_global float *render_buffer,
                             int sx,
                             int sy,
                             int sw,
                             int sh,
                             int offset,
                             int stride)
{
  const int y = ccl_gpu_global_id_x();

  if (y < sh) {
    ccl_gpu_kernel_call(
        film_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
  }
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(adaptive_sampling_filter_y,
                             ccl_global float *render_buffer,
                             int sx,
                             int sy,
                             int sw,
                             int sh,
                             int offset,
                             int stride)
{
  const int x = ccl_gpu_global_id_x();

  if (x < sw) {
    ccl_gpu_kernel_call(
        film_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
  }
}
ccl_gpu_kernel_postfix

/* --------------------------------------------------------------------
 * Cryptomatte.
 */

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(cryptomatte_postprocess,
                             ccl_global float *render_buffer,
                             int num_pixels)
{
  const int pixel_index = ccl_gpu_global_id_x();

  if (pixel_index < num_pixels) {
    ccl_gpu_kernel_call(film_cryptomatte_post(nullptr, render_buffer, pixel_index));
  }
}
ccl_gpu_kernel_postfix

/* --------------------------------------------------------------------
 * Film.
 */

ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba,
                                                          const int rgba_offset,
                                                          const int rgba_stride,
                                                          const int x,
                                                          const int y,
                                                          const half4 half_pixel)
{
  /* Work around HIP issue with half float display, see #92972. */
#ifdef __KERNEL_HIP__
  ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4;
  out[0] = half_pixel.x;
  out[1] = half_pixel.y;
  out[2] = half_pixel.z;
  out[3] = half_pixel.w;
#else
  ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
  *out = half_pixel;
#endif
}
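
/* Editor's note: both branches above address the same memory. For example, with
 * rgba_offset = 0 and rgba_stride = width, pixel (x, y) is half4 element
 * y * width + x, which the HIP path writes as four scalar halves starting at
 * half index 4 * (y * width + x). */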

#ifdef __KERNEL_METAL__

/* Fetch into a local variable on Metal - there is minimal overhead. Templating the
 * film_get_pass_pixel_... functions works on MSL, but not on other compilers. */
#  define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
    float local_pixel[4]; \
    film_get_pass_pixel_##variant(&kfilm_convert, buffer, local_pixel); \
    if (input_channel_count >= 1) { \
      pixel[0] = local_pixel[0]; \
    } \
    if (input_channel_count >= 2) { \
      pixel[1] = local_pixel[1]; \
    } \
    if (input_channel_count >= 3) { \
      pixel[2] = local_pixel[2]; \
    } \
    if (input_channel_count >= 4) { \
      pixel[3] = local_pixel[3]; \
    }

#else

#  define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
    film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel);

#endif

#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
      ccl_gpu_kernel_signature(film_convert_##variant, \
                               const KernelFilmConvert kfilm_convert, \
                               ccl_global float *pixels, \
                               ccl_global float *render_buffer, \
                               int num_pixels, \
                               int width, \
                               int offset, \
                               int stride, \
                               int rgba_offset, \
                               int rgba_stride) \
  { \
    const int render_pixel_index = ccl_gpu_global_id_x(); \
    if (render_pixel_index >= num_pixels) { \
      return; \
    } \
\
    const int x = render_pixel_index % width; \
    const int y = render_pixel_index / width; \
\
    const uint64_t buffer_pixel_index = x + y * stride; \
    ccl_global const float *buffer = render_buffer + offset + \
                                     buffer_pixel_index * kfilm_convert.pass_stride; \
\
    ccl_global float *pixel = pixels + \
                              (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
\
    FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \
  } \
  ccl_gpu_kernel_postfix \
\
  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
      ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
                               const KernelFilmConvert kfilm_convert, \
                               ccl_global uchar4 *rgba, \
                               ccl_global float *render_buffer, \
                               int num_pixels, \
                               int width, \
                               int offset, \
                               int stride, \
                               int rgba_offset, \
                               int rgba_stride) \
  { \
    const int render_pixel_index = ccl_gpu_global_id_x(); \
    if (render_pixel_index >= num_pixels) { \
      return; \
    } \
\
    const int x = render_pixel_index % width; \
    const int y = render_pixel_index / width; \
\
    const uint64_t buffer_pixel_index = x + y * stride; \
    ccl_global const float *buffer = render_buffer + offset + \
                                     buffer_pixel_index * kfilm_convert.pass_stride; \
\
    float pixel[4]; \
    film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
\
    if (input_channel_count == 1) { \
      pixel[1] = pixel[2] = pixel[0]; \
    } \
    if (input_channel_count <= 3) { \
      pixel[3] = 1.0f; \
    } \
\
    film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
\
    const half4 half_pixel = float4_to_half4_display( \
        make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
    kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \
  } \
  ccl_gpu_kernel_postfix

/* 1 channel inputs */
KERNEL_FILM_CONVERT_VARIANT(depth, 1)
KERNEL_FILM_CONVERT_VARIANT(mist, 1)
KERNEL_FILM_CONVERT_VARIANT(sample_count, 1)
KERNEL_FILM_CONVERT_VARIANT(float, 1)

/* 3 channel inputs */
KERNEL_FILM_CONVERT_VARIANT(light_path, 3)
KERNEL_FILM_CONVERT_VARIANT(float3, 3)

/* 4 channel inputs */
KERNEL_FILM_CONVERT_VARIANT(motion, 4)
KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4)
KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4)
KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4)
KERNEL_FILM_CONVERT_VARIANT(combined, 4)
KERNEL_FILM_CONVERT_VARIANT(float4, 4)

#undef KERNEL_FILM_CONVERT_VARIANT
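
/* Editor's note: each KERNEL_FILM_CONVERT_VARIANT(variant, n) line above expands
 * to two kernels; e.g. KERNEL_FILM_CONVERT_VARIANT(depth, 1) defines
 * film_convert_depth (float output) and film_convert_depth_half_rgba (half4
 * display output). */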

/* --------------------------------------------------------------------
 * Shader evaluation.
 */

/* Displacement */

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(shader_eval_displace,
                             ccl_global KernelShaderEvalInput *input,
                             ccl_global float *output,
                             const int offset,
                             const int work_size)
{
  int i = ccl_gpu_global_id_x();
  if (i < work_size) {
    ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i));
  }
}
ccl_gpu_kernel_postfix

/* Background */

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(shader_eval_background,
                             ccl_global KernelShaderEvalInput *input,
                             ccl_global float *output,
                             const int offset,
                             const int work_size)
{
  int i = ccl_gpu_global_id_x();
  if (i < work_size) {
    ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i));
  }
}
ccl_gpu_kernel_postfix

/* Curve Shadow Transparency */

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency,
                             ccl_global KernelShaderEvalInput *input,
                             ccl_global float *output,
                             const int offset,
                             const int work_size)
{
  int i = ccl_gpu_global_id_x();
  if (i < work_size) {
    ccl_gpu_kernel_call(
        kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i));
  }
}
ccl_gpu_kernel_postfix

/* --------------------------------------------------------------------
 * Denoising.
 */

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(filter_color_preprocess,
                             ccl_global float *render_buffer,
                             int full_x,
                             int full_y,
                             int width,
                             int height,
                             int offset,
                             int stride,
                             int pass_stride,
                             int pass_denoised)
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
  ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;

  ccl_global float *color_out = buffer + pass_denoised;
  color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
  color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
  color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(filter_guiding_preprocess,
                             ccl_global float *guiding_buffer,
                             int guiding_pass_stride,
                             int guiding_pass_albedo,
                             int guiding_pass_normal,
                             int guiding_pass_flow,
                             ccl_global const float *render_buffer,
                             int render_offset,
                             int render_stride,
                             int render_pass_stride,
                             int render_pass_sample_count,
                             int render_pass_denoising_albedo,
                             int render_pass_denoising_normal,
                             int render_pass_motion,
                             int full_x,
                             int full_y,
                             int width,
                             int height,
                             int num_samples)
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t guiding_pixel_index = x + y * width;
  ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;

  const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
  ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride;

  float pixel_scale;
  if (render_pass_sample_count == PASS_UNUSED) {
    pixel_scale = 1.0f / num_samples;
  }
  else {
    pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]);
  }

  /* Albedo pass. */
  if (guiding_pass_albedo != PASS_UNUSED) {
    kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);

    ccl_global const float *albedo_in = buffer + render_pass_denoising_albedo;
    ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;

    albedo_out[0] = albedo_in[0] * pixel_scale;
    albedo_out[1] = albedo_in[1] * pixel_scale;
    albedo_out[2] = albedo_in[2] * pixel_scale;
  }

  /* Normal pass. */
  if (guiding_pass_normal != PASS_UNUSED) {
    kernel_assert(render_pass_denoising_normal != PASS_UNUSED);

    ccl_global const float *normal_in = buffer + render_pass_denoising_normal;
    ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;

    normal_out[0] = normal_in[0] * pixel_scale;
    normal_out[1] = normal_in[1] * pixel_scale;
    normal_out[2] = normal_in[2] * pixel_scale;
  }

  /* Flow pass. */
  if (guiding_pass_flow != PASS_UNUSED) {
    kernel_assert(render_pass_motion != PASS_UNUSED);

    ccl_global const float *motion_in = buffer + render_pass_motion;
    ccl_global float *flow_out = guiding_pixel + guiding_pass_flow;

    flow_out[0] = -motion_in[0] * pixel_scale;
    flow_out[1] = -motion_in[1] * pixel_scale;
  }
}
ccl_gpu_kernel_postfix
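
/* Editor's note: the __float_as_uint() read above works because per-pixel sample
 * counts are stored as integer bit patterns inside the float render buffer; e.g.
 * a pixel with 128 accumulated samples yields pixel_scale = 1.0f / 128. */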

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo,
                             ccl_global float *guiding_buffer,
                             int guiding_pass_stride,
                             int guiding_pass_albedo,
                             int width,
                             int height)
{
  kernel_assert(guiding_pass_albedo != PASS_UNUSED);

  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t guiding_pixel_index = x + y * width;
  ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;

  ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;

  albedo_out[0] = 0.5f;
  albedo_out[1] = 0.5f;
  albedo_out[2] = 0.5f;
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(filter_color_postprocess,
                             ccl_global float *render_buffer,
                             int full_x,
                             int full_y,
                             int width,
                             int height,
                             int offset,
                             int stride,
                             int pass_stride,
                             int num_samples,
                             int pass_noisy,
                             int pass_denoised,
                             int pass_sample_count,
                             int num_components,
                             int use_compositing)
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
  ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;

  float pixel_scale;
  if (pass_sample_count == PASS_UNUSED) {
    pixel_scale = num_samples;
  }
  else {
    pixel_scale = __float_as_uint(buffer[pass_sample_count]);
  }

  ccl_global float *denoised_pixel = buffer + pass_denoised;

  denoised_pixel[0] *= pixel_scale;
  denoised_pixel[1] *= pixel_scale;
  denoised_pixel[2] *= pixel_scale;

  if (num_components == 3) {
    /* Pass without alpha channel. */
  }
  else if (!use_compositing) {
    /* Currently compositing passes are either 3-component (derived by dividing light passes)
     * or have no transparency (shadow catcher). Rely on this implicitly, as it simplifies
     * the code and avoids extra memory allocation. */
    ccl_global const float *noisy_pixel = buffer + pass_noisy;
    denoised_pixel[3] = noisy_pixel[3];
  }
  else {
    /* Assign zero, which is the default alpha value for 3-component passes and an opaque
     * pixel for 4-component passes. */
    denoised_pixel[3] = 0;
  }
}
ccl_gpu_kernel_postfix

/* --------------------------------------------------------------------
 * Shadow catcher.
 */

ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits,
                             int num_states,
                             ccl_global uint *num_possible_splits)
{
  const int state = ccl_gpu_global_id_x();

  bool can_split = false;

  if (state < num_states) {
    can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state));
  }

  /* NOTE: All threads specified in the mask must execute the intrinsic. */
  const auto can_split_mask = ccl_gpu_ballot(can_split);
  const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
  if (lane_id == 0) {
    atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask));
  }
}
ccl_gpu_kernel_postfix