diff --git a/build_files/build_environment/cmake/openimagedenoise.cmake b/build_files/build_environment/cmake/openimagedenoise.cmake index 68c248a4351..03ca17a7abc 100644 --- a/build_files/build_environment/cmake/openimagedenoise.cmake +++ b/build_files/build_environment/cmake/openimagedenoise.cmake @@ -9,7 +9,12 @@ set(OIDN_EXTRA_ARGS -DOIDN_FILTER_RTLIGHTMAP=OFF -DPython_EXECUTABLE=${PYTHON_BINARY} ) -if(NOT APPLE) +if(APPLE) + set(OIDN_EXTRA_ARGS + ${OIDN_EXTRA_ARGS} + -DOIDN_DEVICE_METAL=ON + ) +else() set(OIDN_EXTRA_ARGS ${OIDN_EXTRA_ARGS} -DOIDN_DEVICE_SYCL=ON diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index b499058ce1a..deefe0404d2 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -257,6 +257,12 @@ class Device { return false; } + /* Returns native buffer handle for device pointer. */ + virtual void *get_native_buffer(device_ptr /*ptr*/) + { + return nullptr; + } + /* Guiding */ /* Returns path guiding device handle. 
*/ diff --git a/intern/cycles/device/metal/device.mm b/intern/cycles/device/metal/device.mm index e8c03a538f6..5b8b40efbf2 100644 --- a/intern/cycles/device/metal/device.mm +++ b/intern/cycles/device/metal/device.mm @@ -6,6 +6,7 @@ # include "device/metal/device.h" # include "device/metal/device_impl.h" +# include "integrator/denoiser_oidn_gpu.h" #endif @@ -55,6 +56,11 @@ void device_metal_info(vector &devices) info.display_device = true; info.denoisers = DENOISER_NONE; info.id = id; +# if defined(WITH_OPENIMAGEDENOISE) + if (OIDNDenoiserGPU::is_device_supported(info)) { + info.denoisers |= DENOISER_OPENIMAGEDENOISE; + } +# endif MetalGPUVendor vendor = MetalInfo::get_device_vendor(device); diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index ec708e08bd7..04a862d7785 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -134,6 +134,8 @@ class MetalDevice : public Device { virtual bool should_use_graphics_interop() override; + virtual void *get_native_buffer(device_ptr ptr) override; + virtual unique_ptr gpu_queue_create() override; virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override; diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 61283a62034..c60917422ba 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -1389,6 +1389,11 @@ bool MetalDevice::should_use_graphics_interop() return false; } +void *MetalDevice::get_native_buffer(device_ptr ptr) +{ + return ((MetalMem *)ptr)->mtlBuffer; +} + void MetalDevice::flush_delayed_free_list() { /* free any Metal buffers that may have been freed by host while a command diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index 2d711aaacca..b7aebb3c4a7 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -40,6 +40,8 @@ class 
MetalDeviceQueue : public DeviceQueue { virtual void copy_to_device(device_memory &mem) override; virtual void copy_from_device(device_memory &mem) override; + virtual void *native_queue() override; + protected: void setup_capture(); void update_capture(DeviceKernel kernel); diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 9d3195beaa6..067e7f3dba9 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -979,6 +979,11 @@ void MetalDeviceQueue::close_blit_encoder() } } +void *MetalDeviceQueue::native_queue() +{ + return mtlCommandQueue_; +} + CCL_NAMESPACE_END #endif /* WITH_METAL */ diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h index 190e1c77d20..1394ec8d85f 100644 --- a/intern/cycles/device/queue.h +++ b/intern/cycles/device/queue.h @@ -157,6 +157,11 @@ class DeviceQueue { /* Device this queue has been created for. */ Device *device; + virtual void *native_queue() + { + return nullptr; + } + protected: /* Hide construction so that allocation via `Device` API is enforced. 
*/ explicit DeviceQueue(Device *device); diff --git a/intern/cycles/integrator/denoiser_gpu.cpp b/intern/cycles/integrator/denoiser_gpu.cpp index 501d124b4a6..4203990d18a 100644 --- a/intern/cycles/integrator/denoiser_gpu.cpp +++ b/intern/cycles/integrator/denoiser_gpu.cpp @@ -324,9 +324,9 @@ void DenoiserGPU::denoise_color_read(const DenoiseContext &context, const Denois denoiser_queue_.get(), pass_access_info, 1.0f, context.num_samples); PassAccessor::Destination destination(pass_access_info.type); - destination.d_pixels = context.render_buffers->buffer.device_pointer + - pass.denoised_offset * sizeof(float); + destination.d_pixels = context.render_buffers->buffer.device_pointer; destination.num_components = 3; + destination.pixel_offset = pass.denoised_offset; destination.pixel_stride = context.buffer_params.pass_stride; BufferParams buffer_params = context.buffer_params; diff --git a/intern/cycles/integrator/denoiser_oidn_gpu.cpp b/intern/cycles/integrator/denoiser_oidn_gpu.cpp index 948094d1046..484b17b7fdc 100644 --- a/intern/cycles/integrator/denoiser_oidn_gpu.cpp +++ b/intern/cycles/integrator/denoiser_oidn_gpu.cpp @@ -47,6 +47,20 @@ bool OIDNDenoiserGPU::is_device_supported(const DeviceInfo &device) case DEVICE_OPTIX: device_type = OIDN_DEVICE_TYPE_CUDA; break; +# endif +# ifdef OIDN_DEVICE_METAL + case DEVICE_METAL: { + int num_devices = oidnGetNumPhysicalDevices(); + for (int i = 0; i < num_devices; i++) { + if (oidnGetPhysicalDeviceUInt(i, "type") == OIDN_DEVICE_TYPE_METAL) { + const char *name = oidnGetPhysicalDeviceString(i, "name"); + if (name && name[0] != '\0' && device.id.find(name) != std::string::npos) { /* A null name is UB for find(); an empty one would match every id. */ + return true; + } + } + } + return false; + } # endif case DEVICE_CPU: /* This is the GPU denoiser - CPU devices shouldn't end up here. 
*/ @@ -111,6 +125,9 @@ uint OIDNDenoiserGPU::get_device_type_mask() const # ifdef OIDN_DEVICE_SYCL device_mask |= DEVICE_MASK_ONEAPI; # endif +# ifdef OIDN_DEVICE_METAL + device_mask |= DEVICE_MASK_METAL; +# endif # ifdef OIDN_DEVICE_CUDA device_mask |= DEVICE_MASK_CUDA; device_mask |= DEVICE_MASK_OPTIX; @@ -162,6 +179,13 @@ bool OIDNDenoiserGPU::denoise_create_if_needed(DenoiseContext &context) 1); break; # endif +# if defined(OIDN_DEVICE_METAL) && defined(WITH_METAL) + case DEVICE_METAL: { + denoiser_queue_->init_execution(); + const MTLCommandQueue_id queue = (const MTLCommandQueue_id)denoiser_queue_->native_queue(); + oidn_device_ = oidnNewMetalDevice(&queue, 1); + } break; +# endif # if defined(OIDN_DEVICE_CUDA) && defined(WITH_CUDA) case DEVICE_CUDA: case DEVICE_OPTIX: { @@ -262,24 +286,24 @@ bool OIDNDenoiserGPU::denoise_run(const DenoiseContext &context, const DenoisePa /* Color pass. */ const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float); - oidnSetSharedFilterImage(oidn_filter_, - "color", - (void *)context.render_buffers->buffer.device_pointer, - OIDN_FORMAT_FLOAT3, - context.buffer_params.width, - context.buffer_params.height, - pass.denoised_offset * sizeof(float), - pass_stride_in_bytes, - pass_stride_in_bytes * context.buffer_params.stride); - oidnSetSharedFilterImage(oidn_filter_, - "output", - (void *)context.render_buffers->buffer.device_pointer, - OIDN_FORMAT_FLOAT3, - context.buffer_params.width, - context.buffer_params.height, - pass.denoised_offset * sizeof(float), - pass_stride_in_bytes, - pass_stride_in_bytes * context.buffer_params.stride); + set_filter_pass(oidn_filter_, + "color", + context.render_buffers->buffer.device_pointer, + OIDN_FORMAT_FLOAT3, + context.buffer_params.width, + context.buffer_params.height, + pass.denoised_offset * sizeof(float), + pass_stride_in_bytes, + pass_stride_in_bytes * context.buffer_params.stride); + set_filter_pass(oidn_filter_, + "output", + 
context.render_buffers->buffer.device_pointer, + OIDN_FORMAT_FLOAT3, + context.buffer_params.width, + context.buffer_params.height, + pass.denoised_offset * sizeof(float), + pass_stride_in_bytes, + pass_stride_in_bytes * context.buffer_params.stride); /* Optional albedo and color passes. */ if (context.num_input_passes > 1) { @@ -289,95 +313,95 @@ bool OIDNDenoiserGPU::denoise_run(const DenoiseContext &context, const DenoisePa if (context.use_pass_albedo) { if (params_.prefilter == DENOISER_PREFILTER_NONE) { - oidnSetSharedFilterImage(oidn_filter_, - "albedo", - (void *)d_guiding_buffer, - OIDN_FORMAT_FLOAT3, - context.buffer_params.width, - context.buffer_params.height, - context.guiding_params.pass_albedo * sizeof(float), - pixel_stride_in_bytes, - row_stride_in_bytes); + set_filter_pass(oidn_filter_, + "albedo", + d_guiding_buffer, + OIDN_FORMAT_FLOAT3, + context.buffer_params.width, + context.buffer_params.height, + context.guiding_params.pass_albedo * sizeof(float), + pixel_stride_in_bytes, + row_stride_in_bytes); } else { - oidnSetSharedFilterImage(albedo_filter_, - "color", - (void *)d_guiding_buffer, - OIDN_FORMAT_FLOAT3, - context.buffer_params.width, - context.buffer_params.height, - context.guiding_params.pass_albedo * sizeof(float), - pixel_stride_in_bytes, - row_stride_in_bytes); - oidnSetSharedFilterImage(albedo_filter_, - "output", - (void *)d_guiding_buffer, - OIDN_FORMAT_FLOAT3, - context.buffer_params.width, - context.buffer_params.height, - context.guiding_params.pass_albedo * sizeof(float), - pixel_stride_in_bytes, - row_stride_in_bytes); + set_filter_pass(albedo_filter_, + "color", + d_guiding_buffer, + OIDN_FORMAT_FLOAT3, + context.buffer_params.width, + context.buffer_params.height, + context.guiding_params.pass_albedo * sizeof(float), + pixel_stride_in_bytes, + row_stride_in_bytes); + set_filter_pass(albedo_filter_, + "output", + d_guiding_buffer, + OIDN_FORMAT_FLOAT3, + context.buffer_params.width, + context.buffer_params.height, + 
context.guiding_params.pass_albedo * sizeof(float), + pixel_stride_in_bytes, + row_stride_in_bytes); oidnCommitFilter(albedo_filter_); oidnExecuteFilterAsync(albedo_filter_); - oidnSetSharedFilterImage(oidn_filter_, - "albedo", - (void *)d_guiding_buffer, - OIDN_FORMAT_FLOAT3, - context.buffer_params.width, - context.buffer_params.height, - context.guiding_params.pass_albedo * sizeof(float), - pixel_stride_in_bytes, - row_stride_in_bytes); + set_filter_pass(oidn_filter_, + "albedo", + d_guiding_buffer, + OIDN_FORMAT_FLOAT3, + context.buffer_params.width, + context.buffer_params.height, + context.guiding_params.pass_albedo * sizeof(float), + pixel_stride_in_bytes, + row_stride_in_bytes); } } if (context.use_pass_normal) { if (params_.prefilter == DENOISER_PREFILTER_NONE) { - oidnSetSharedFilterImage(oidn_filter_, - "normal", - (void *)d_guiding_buffer, - OIDN_FORMAT_FLOAT3, - context.buffer_params.width, - context.buffer_params.height, - context.guiding_params.pass_normal * sizeof(float), - pixel_stride_in_bytes, - row_stride_in_bytes); + set_filter_pass(oidn_filter_, + "normal", + d_guiding_buffer, + OIDN_FORMAT_FLOAT3, + context.buffer_params.width, + context.buffer_params.height, + context.guiding_params.pass_normal * sizeof(float), + pixel_stride_in_bytes, + row_stride_in_bytes); } else { - oidnSetSharedFilterImage(normal_filter_, - "color", - (void *)d_guiding_buffer, - OIDN_FORMAT_FLOAT3, - context.buffer_params.width, - context.buffer_params.height, - context.guiding_params.pass_normal * sizeof(float), - pixel_stride_in_bytes, - row_stride_in_bytes); + set_filter_pass(normal_filter_, + "color", + d_guiding_buffer, + OIDN_FORMAT_FLOAT3, + context.buffer_params.width, + context.buffer_params.height, + context.guiding_params.pass_normal * sizeof(float), + pixel_stride_in_bytes, + row_stride_in_bytes); - oidnSetSharedFilterImage(normal_filter_, - "output", - (void *)d_guiding_buffer, - OIDN_FORMAT_FLOAT3, - context.buffer_params.width, - 
context.buffer_params.height, - context.guiding_params.pass_normal * sizeof(float), - pixel_stride_in_bytes, - row_stride_in_bytes); + set_filter_pass(normal_filter_, + "output", + d_guiding_buffer, + OIDN_FORMAT_FLOAT3, + context.buffer_params.width, + context.buffer_params.height, + context.guiding_params.pass_normal * sizeof(float), + pixel_stride_in_bytes, + row_stride_in_bytes); oidnCommitFilter(normal_filter_); oidnExecuteFilterAsync(normal_filter_); - oidnSetSharedFilterImage(oidn_filter_, - "normal", - (void *)d_guiding_buffer, - OIDN_FORMAT_FLOAT3, - context.buffer_params.width, - context.buffer_params.height, - context.guiding_params.pass_normal * sizeof(float), - pixel_stride_in_bytes, - row_stride_in_bytes); + set_filter_pass(oidn_filter_, + "normal", + d_guiding_buffer, + OIDN_FORMAT_FLOAT3, + context.buffer_params.width, + context.buffer_params.height, + context.guiding_params.pass_normal * sizeof(float), + pixel_stride_in_bytes, + row_stride_in_bytes); } } } @@ -409,6 +433,48 @@ bool OIDNDenoiserGPU::denoise_run(const DenoiseContext &context, const DenoisePa return true; } +void OIDNDenoiserGPU::set_filter_pass(OIDNFilter filter, + const char *name, + device_ptr ptr, + int format, + int width, + int height, + size_t offset_in_bytes, + size_t pixel_stride_in_bytes, + size_t row_stride_in_bytes) +{ +# if defined(OIDN_DEVICE_METAL) && defined(WITH_METAL) + if (denoiser_device_->info.type == DEVICE_METAL) { + void *mtl_buffer = denoiser_device_->get_native_buffer(ptr); + OIDNBuffer oidn_buffer = oidnNewSharedBufferFromMetal(oidn_device_, mtl_buffer); + + oidnSetFilterImage(filter, + name, + oidn_buffer, + (OIDNFormat)format, + width, + height, + offset_in_bytes, + pixel_stride_in_bytes, + row_stride_in_bytes); + + oidnReleaseBuffer(oidn_buffer); + } + else +# endif + { + oidnSetSharedFilterImage(filter, + name, + (void *)ptr, + (OIDNFormat)format, + width, + height, + offset_in_bytes, + pixel_stride_in_bytes, + row_stride_in_bytes); + } +} + 
CCL_NAMESPACE_END #endif diff --git a/intern/cycles/integrator/denoiser_oidn_gpu.h b/intern/cycles/integrator/denoiser_oidn_gpu.h index aad583a5a69..6079088b256 100644 --- a/intern/cycles/integrator/denoiser_oidn_gpu.h +++ b/intern/cycles/integrator/denoiser_oidn_gpu.h @@ -51,6 +51,16 @@ class OIDNDenoiserGPU : public DenoiserGPU { OIDNFilter create_filter(); + void set_filter_pass(OIDNFilter filter, + const char *name, + device_ptr ptr, + int format, + int width, + int height, + size_t offset_in_bytes, + size_t pixel_stride_in_bytes, + size_t row_stride_in_bytes); + OIDNDevice oidn_device_ = nullptr; OIDNFilter oidn_filter_ = nullptr; OIDNFilter albedo_filter_ = nullptr; diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h index 157d09dca13..a839d6a1933 100644 --- a/intern/cycles/integrator/pass_accessor.h +++ b/intern/cycles/integrator/pass_accessor.h @@ -69,6 +69,10 @@ class PassAccessor { * Allows to get pixels of render buffer into a partial slice of the destination. */ int offset = 0; + /* Offset in floats from the beginning of pixels storage. + * Is ignored for half4 destination. */ + int pixel_offset = 0; + /* Number of floats per pixel. When zero is the same as `num_components`. 
* * NOTE: Is ignored for half4 destination, as the half4 pixels are always 4-component diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp index 8b5dfac9dc1..b890bedce34 100644 --- a/intern/cycles/integrator/pass_accessor_cpu.cpp +++ b/intern/cycles/integrator/pass_accessor_cpu.cpp @@ -47,7 +47,7 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float( parallel_for(0, buffer_params.window_height, [&](int64_t y) { const float *buffer = window_data + y * buffer_row_stride; - float *pixel = destination.pixels + + float *pixel = destination.pixels + destination.pixel_offset + (y * buffer_params.width + destination.offset) * pixel_stride; func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride, pixel_stride); }); diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp index 3d161ce49ef..cb0cf58d08d 100644 --- a/intern/cycles/integrator/pass_accessor_gpu.cpp +++ b/intern/cycles/integrator/pass_accessor_gpu.cpp @@ -48,6 +48,7 @@ void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel, &buffer_params.window_width, &offset, &buffer_params.stride, + &destination.pixel_offset, &destination.offset, &destination_stride); diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index a16a3631b1e..03f8b80c0a5 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -809,6 +809,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb int width, \ int offset, \ int stride, \ + int channel_offset, \ int rgba_offset, \ int rgba_stride) \ { \ @@ -824,7 +825,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb ccl_global const float *buffer = render_buffer + offset + \ buffer_pixel_index * kfilm_convert.pass_stride; \ \ - ccl_global float *pixel = pixels + \ + ccl_global float *pixel = 
pixels + channel_offset + \ (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \ \ FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \