Cycles: Metal support for OpenImageDenoise

Metal-accelerated OpenImageDenoise denoising is supported on Apple Silicon GPUs running macOS 13.0 or newer.

Co-authored-by: Stefan Werner <stefan.werner@intel.com>
Co-authored-by: Attila Afra <attila.t.afra@intel.com>
Pull Request: https://projects.blender.org/blender/blender/pulls/116124
Authored by Stefan Werner on 2024-02-06 21:13:23 +01:00; committed by Brecht Van Lommel.
Parent: 691584da1b
Commit: 31d55e87f9
15 changed files with 213 additions and 95 deletions
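
For orientation, the Metal interop pattern this patch wires up is: create the OIDN device on the Metal command queue Cycles already submits work to (exposed via MetalDeviceQueue::native_queue), resolve a render-buffer device pointer to its underlying MTLBuffer (MetalDevice::get_native_buffer), wrap that buffer in an OIDNBuffer, and bind it to the filter with oidnSetFilterImage rather than handing OIDN a raw pointer through oidnSetSharedFilterImage. The following is a minimal standalone sketch of that flow, not part of the patch; it assumes OpenImageDenoise 2.2+ built with Metal support, an existing command queue, and a Metal buffer of interleaved float pixels (the helper name and parameters are illustrative).

/* Illustrative sketch only (not Cycles code): denoise a packed float3 image
 * that lives in an existing MTLBuffer, using an existing Metal command queue. */
#include <OpenImageDenoise/oidn.h>
#include <cstddef>

static bool denoise_with_oidn_metal(MTLCommandQueue_id queue,
                                    MTLBuffer_id pixels_mtl,
                                    const int width,
                                    const int height,
                                    const size_t pixel_stride_in_bytes)
{
  /* Create the OIDN device on the renderer's own command queue. */
  OIDNDevice device = oidnNewMetalDevice(&queue, 1);
  oidnCommitDevice(device);

  /* Wrap the existing MTLBuffer instead of passing a raw host pointer. */
  OIDNBuffer buffer = oidnNewSharedBufferFromMetal(device, pixels_mtl);

  OIDNFilter filter = oidnNewFilter(device, "RT");
  const size_t row_stride_in_bytes = pixel_stride_in_bytes * width;
  oidnSetFilterImage(filter, "color", buffer, OIDN_FORMAT_FLOAT3,
                     width, height, 0, pixel_stride_in_bytes, row_stride_in_bytes);
  oidnSetFilterImage(filter, "output", buffer, OIDN_FORMAT_FLOAT3,
                     width, height, 0, pixel_stride_in_bytes, row_stride_in_bytes);
  oidnCommitFilter(filter);

  /* Denoise in place and wait for the result. */
  oidnExecuteFilterAsync(filter);
  oidnSyncDevice(device);
  const bool ok = (oidnGetDeviceError(device, nullptr) == OIDN_ERROR_NONE);

  oidnReleaseBuffer(buffer);
  oidnReleaseFilter(filter);
  oidnReleaseDevice(device);
  return ok;
}

The existing SYCL/CUDA/HIP paths keep using oidnSetSharedFilterImage with the device pointer directly; the new set_filter_pass helper introduced in this patch switches between the two per device type.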


@@ -9,7 +9,12 @@ set(OIDN_EXTRA_ARGS
-DOIDN_FILTER_RTLIGHTMAP=OFF
-DPython_EXECUTABLE=${PYTHON_BINARY}
)
if(NOT APPLE)
if(APPLE)
set(OIDN_EXTRA_ARGS
${OIDN_EXTRA_ARGS}
-DOIDN_DEVICE_METAL=ON
)
else()
set(OIDN_EXTRA_ARGS
${OIDN_EXTRA_ARGS}
-DOIDN_DEVICE_SYCL=ON


@@ -257,6 +257,12 @@ class Device {
return false;
}
/* Returns native buffer handle for device pointer. */
virtual void *get_native_buffer(device_ptr /*ptr*/)
{
return nullptr;
}
/* Guiding */
/* Returns path guiding device handle. */


@@ -6,6 +6,7 @@
# include "device/metal/device.h"
# include "device/metal/device_impl.h"
# include "integrator/denoiser_oidn_gpu.h"
#endif
@@ -55,6 +56,11 @@ void device_metal_info(vector<DeviceInfo> &devices)
info.display_device = true;
info.denoisers = DENOISER_NONE;
info.id = id;
# if defined(WITH_OPENIMAGEDENOISE)
if (OIDNDenoiserGPU::is_device_supported(info)) {
info.denoisers |= DENOISER_OPENIMAGEDENOISE;
}
# endif
MetalGPUVendor vendor = MetalInfo::get_device_vendor(device);


@@ -134,6 +134,8 @@ class MetalDevice : public Device {
virtual bool should_use_graphics_interop() override;
virtual void *get_native_buffer(device_ptr ptr) override;
virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;


@@ -1389,6 +1389,11 @@ bool MetalDevice::should_use_graphics_interop()
return false;
}
void *MetalDevice::get_native_buffer(device_ptr ptr)
{
return ((MetalMem *)ptr)->mtlBuffer;
}
void MetalDevice::flush_delayed_free_list()
{
/* free any Metal buffers that may have been freed by host while a command


@@ -40,6 +40,8 @@ class MetalDeviceQueue : public DeviceQueue {
virtual void copy_to_device(device_memory &mem) override;
virtual void copy_from_device(device_memory &mem) override;
virtual void *native_queue() override;
protected:
void setup_capture();
void update_capture(DeviceKernel kernel);


@@ -979,6 +979,11 @@ void MetalDeviceQueue::close_blit_encoder()
}
}
void *MetalDeviceQueue::native_queue()
{
return mtlCommandQueue_;
}
CCL_NAMESPACE_END
#endif /* WITH_METAL */


@@ -157,6 +157,11 @@ class DeviceQueue {
/* Device this queue has been created for. */
Device *device;
virtual void *native_queue()
{
return nullptr;
}
protected:
/* Hide construction so that allocation via `Device` API is enforced. */
explicit DeviceQueue(Device *device);


@@ -324,9 +324,9 @@ void DenoiserGPU::denoise_color_read(const DenoiseContext &context, const Denois
denoiser_queue_.get(), pass_access_info, 1.0f, context.num_samples);
PassAccessor::Destination destination(pass_access_info.type);
destination.d_pixels = context.render_buffers->buffer.device_pointer +
pass.denoised_offset * sizeof(float);
destination.d_pixels = context.render_buffers->buffer.device_pointer;
destination.num_components = 3;
destination.pixel_offset = pass.denoised_offset;
destination.pixel_stride = context.buffer_params.pass_stride;
BufferParams buffer_params = context.buffer_params;
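
For context on the destination change above: the denoised pass offset is no longer baked into the destination device pointer; it now travels as pixel_offset (in floats) together with num_components = 3 and the pass stride, and the film-convert kernel applies it per pixel (the channel_offset parameter added near the end of this diff). The final address is unchanged. With, say, a pass stride of 36 floats and a denoised offset of 12, pixel 5 still writes its RGB starting at float index 12 + 5 * 36 = 192 from the start of the render buffer. This matters for Metal, where a device_ptr is a handle to a MetalMem object rather than a raw address (see get_native_buffer above), so offsetting the pointer itself is not meaningful there.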


@@ -47,6 +47,20 @@ bool OIDNDenoiserGPU::is_device_supported(const DeviceInfo &device)
case DEVICE_OPTIX:
device_type = OIDN_DEVICE_TYPE_CUDA;
break;
# endif
# ifdef OIDN_DEVICE_METAL
case DEVICE_METAL: {
int num_devices = oidnGetNumPhysicalDevices();
for (int i = 0; i < num_devices; i++) {
if (oidnGetPhysicalDeviceUInt(i, "type") == OIDN_DEVICE_TYPE_METAL) {
const char *name = oidnGetPhysicalDeviceString(i, "name");
if (device.id.find(name) != std::string::npos) {
return true;
}
}
}
return false;
}
# endif
case DEVICE_CPU:
/* This is the GPU denoiser - CPU devices shouldn't end up here. */
@@ -111,6 +125,9 @@ uint OIDNDenoiserGPU::get_device_type_mask() const
# ifdef OIDN_DEVICE_SYCL
device_mask |= DEVICE_MASK_ONEAPI;
# endif
# ifdef OIDN_DEVICE_METAL
device_mask |= DEVICE_MASK_METAL;
# endif
# ifdef OIDN_DEVICE_CUDA
device_mask |= DEVICE_MASK_CUDA;
device_mask |= DEVICE_MASK_OPTIX;
@@ -162,6 +179,13 @@ bool OIDNDenoiserGPU::denoise_create_if_needed(DenoiseContext &context)
1);
break;
# endif
# if defined(OIDN_DEVICE_METAL) && defined(WITH_METAL)
case DEVICE_METAL: {
denoiser_queue_->init_execution();
const MTLCommandQueue_id queue = (const MTLCommandQueue_id)denoiser_queue_->native_queue();
oidn_device_ = oidnNewMetalDevice(&queue, 1);
} break;
# endif
# if defined(OIDN_DEVICE_CUDA) && defined(WITH_CUDA)
case DEVICE_CUDA:
case DEVICE_OPTIX: {
@@ -262,24 +286,24 @@ bool OIDNDenoiserGPU::denoise_run(const DenoiseContext &context, const DenoisePa
/* Color pass. */
const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float);
oidnSetSharedFilterImage(oidn_filter_,
"color",
(void *)context.render_buffers->buffer.device_pointer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
pass.denoised_offset * sizeof(float),
pass_stride_in_bytes,
pass_stride_in_bytes * context.buffer_params.stride);
oidnSetSharedFilterImage(oidn_filter_,
"output",
(void *)context.render_buffers->buffer.device_pointer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
pass.denoised_offset * sizeof(float),
pass_stride_in_bytes,
pass_stride_in_bytes * context.buffer_params.stride);
set_filter_pass(oidn_filter_,
"color",
context.render_buffers->buffer.device_pointer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
pass.denoised_offset * sizeof(float),
pass_stride_in_bytes,
pass_stride_in_bytes * context.buffer_params.stride);
set_filter_pass(oidn_filter_,
"output",
context.render_buffers->buffer.device_pointer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
pass.denoised_offset * sizeof(float),
pass_stride_in_bytes,
pass_stride_in_bytes * context.buffer_params.stride);
/* Optional albedo and color passes. */
if (context.num_input_passes > 1) {
@@ -289,95 +313,95 @@ bool OIDNDenoiserGPU::denoise_run(const DenoiseContext &context, const DenoisePa
if (context.use_pass_albedo) {
if (params_.prefilter == DENOISER_PREFILTER_NONE) {
oidnSetSharedFilterImage(oidn_filter_,
"albedo",
(void *)d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_albedo * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
set_filter_pass(oidn_filter_,
"albedo",
d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_albedo * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
}
else {
oidnSetSharedFilterImage(albedo_filter_,
"color",
(void *)d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_albedo * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
oidnSetSharedFilterImage(albedo_filter_,
"output",
(void *)d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_albedo * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
set_filter_pass(albedo_filter_,
"color",
d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_albedo * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
set_filter_pass(albedo_filter_,
"output",
d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_albedo * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
oidnCommitFilter(albedo_filter_);
oidnExecuteFilterAsync(albedo_filter_);
oidnSetSharedFilterImage(oidn_filter_,
"albedo",
(void *)d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_albedo * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
set_filter_pass(oidn_filter_,
"albedo",
d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_albedo * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
}
}
if (context.use_pass_normal) {
if (params_.prefilter == DENOISER_PREFILTER_NONE) {
oidnSetSharedFilterImage(oidn_filter_,
"normal",
(void *)d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_normal * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
set_filter_pass(oidn_filter_,
"normal",
d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_normal * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
}
else {
oidnSetSharedFilterImage(normal_filter_,
"color",
(void *)d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_normal * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
set_filter_pass(normal_filter_,
"color",
d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_normal * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
oidnSetSharedFilterImage(normal_filter_,
"output",
(void *)d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_normal * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
set_filter_pass(normal_filter_,
"output",
d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_normal * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
oidnCommitFilter(normal_filter_);
oidnExecuteFilterAsync(normal_filter_);
oidnSetSharedFilterImage(oidn_filter_,
"normal",
(void *)d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_normal * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
set_filter_pass(oidn_filter_,
"normal",
d_guiding_buffer,
OIDN_FORMAT_FLOAT3,
context.buffer_params.width,
context.buffer_params.height,
context.guiding_params.pass_normal * sizeof(float),
pixel_stride_in_bytes,
row_stride_in_bytes);
}
}
}
@@ -409,6 +433,48 @@ bool OIDNDenoiserGPU::denoise_run(const DenoiseContext &context, const DenoisePa
return true;
}
void OIDNDenoiserGPU::set_filter_pass(OIDNFilter filter,
const char *name,
device_ptr ptr,
int format,
int width,
int height,
size_t offset_in_bytes,
size_t pixel_stride_in_bytes,
size_t row_stride_in_bytes)
{
# if defined(OIDN_DEVICE_METAL) && defined(WITH_METAL)
if (denoiser_device_->info.type == DEVICE_METAL) {
void *mtl_buffer = denoiser_device_->get_native_buffer(ptr);
OIDNBuffer oidn_buffer = oidnNewSharedBufferFromMetal(oidn_device_, mtl_buffer);
oidnSetFilterImage(filter,
name,
oidn_buffer,
(OIDNFormat)format,
width,
height,
offset_in_bytes,
pixel_stride_in_bytes,
row_stride_in_bytes);
oidnReleaseBuffer(oidn_buffer);
}
else
# endif
{
oidnSetSharedFilterImage(filter,
name,
(void *)ptr,
(OIDNFormat)format,
width,
height,
offset_in_bytes,
pixel_stride_in_bytes,
row_stride_in_bytes);
}
}
CCL_NAMESPACE_END
#endif
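
The is_device_supported change earlier in this diff matches the Cycles Metal device id against the names OIDN reports for its Metal physical devices. As a standalone aid (not part of the patch), the same enumeration API can be used to inspect what OIDN sees on a given machine; a minimal sketch assuming OpenImageDenoise 2.2+ headers:

#include <OpenImageDenoise/oidn.h>
#include <cstdio>

int main()
{
  /* Enumerate the physical devices OIDN knows about and print the "type" and
   * "name" properties, the same ones OIDNDenoiserGPU::is_device_supported queries. */
  const int num_devices = oidnGetNumPhysicalDevices();
  for (int i = 0; i < num_devices; i++) {
    const unsigned int type = oidnGetPhysicalDeviceUInt(i, "type");
    const char *name = oidnGetPhysicalDeviceString(i, "name");
    std::printf("physical device %d: type=%u, name=%s\n", i, type, name);
  }
  return 0;
}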


@@ -51,6 +51,16 @@ class OIDNDenoiserGPU : public DenoiserGPU {
OIDNFilter create_filter();
void set_filter_pass(OIDNFilter filter,
const char *name,
device_ptr ptr,
int format,
int width,
int height,
size_t offset_in_bytes,
size_t pixel_stride_in_bytes,
size_t row_stride_in_bytes);
OIDNDevice oidn_device_ = nullptr;
OIDNFilter oidn_filter_ = nullptr;
OIDNFilter albedo_filter_ = nullptr;


@@ -69,6 +69,10 @@ class PassAccessor {
* Allows to get pixels of render buffer into a partial slice of the destination. */
int offset = 0;
/* Offset in floats from the beginning of pixels storage.
* Is ignored for half4 destination. */
int pixel_offset = 0;
/* Number of floats per pixel. When zero is the same as `num_components`.
*
* NOTE: Is ignored for half4 destination, as the half4 pixels are always 4-component


@@ -47,7 +47,7 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
parallel_for(0, buffer_params.window_height, [&](int64_t y) {
const float *buffer = window_data + y * buffer_row_stride;
float *pixel = destination.pixels +
float *pixel = destination.pixels + destination.pixel_offset +
(y * buffer_params.width + destination.offset) * pixel_stride;
func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride, pixel_stride);
});


@@ -48,6 +48,7 @@ void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
&buffer_params.window_width,
&offset,
&buffer_params.stride,
&destination.pixel_offset,
&destination.offset,
&destination_stride);


@@ -809,6 +809,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb
int width, \
int offset, \
int stride, \
int channel_offset, \
int rgba_offset, \
int rgba_stride) \
{ \
@@ -824,7 +825,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb
ccl_global const float *buffer = render_buffer + offset + \
buffer_pixel_index * kfilm_convert.pass_stride; \
\
ccl_global float *pixel = pixels + \
ccl_global float *pixel = pixels + channel_offset + \
(render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
\
FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \