Cycles: add NanoVDB support for Metal on Apple Silicon

Contributed by Yulia Kuznetcova at Apple.

NanoVDB is patched to add the address spaces required by Metal. We hope that
in the future Metal will support the generic address space.

For AMD and Intel GPUs this is currently not available, since it causes a
performance regression even in scenes without volumes.
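
Metal has no generic address space, so pointer and reference types in the shared
NanoVDB headers have to carry an explicit address space when compiled as Metal.
A minimal sketch of the kind of annotation involved (the macro and function
names below are illustrative only, not the actual patch):

/* Illustrative sketch: hide the Metal address-space keyword behind a macro so
 * the same header still compiles for the CPU, CUDA and HIP backends. */
#if defined(__METAL_VERSION__)
#  define NANOVDB_METAL_GLOBAL device
#else
#  define NANOVDB_METAL_GLOBAL
#endif

template<typename ValueT>
inline ValueT sample_nearest(NANOVDB_METAL_GLOBAL const nanovdb::NanoGrid<ValueT> *grid,
                             nanovdb::Coord ijk)
{
  /* getAccessor()/getValue() are ordinary NanoVDB calls; the address-space
   * qualifier on the grid pointer is what the patch has to thread through
   * the accessor and sampler templates. */
  auto acc = grid->getAccessor();
  return acc.getValue(ijk);
}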

Pull Request #104837
Commit 02c2970983 by Brecht Van Lommel, 2023-02-10 18:44:46 +01:00 (parent 8b8d8acc84)
9 changed files with 8128 additions and 53 deletions

@@ -44,13 +44,21 @@ set(OPENVDB_EXTRA_ARGS
# -DLLVM_DIR=${LIBDIR}/llvm/lib/cmake/llvm
)
set(OPENVDB_PATCH ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/openvdb/src/openvdb < ${PATCH_DIR}/openvdb.diff)
if(APPLE)
set(OPENVDB_PATCH
${OPENVDB_PATCH} &&
${PATCH_CMD} -p 0 -d ${BUILD_DIR}/openvdb/src/openvdb < ${PATCH_DIR}/openvdb_metal.diff
)
endif()
ExternalProject_Add(openvdb
URL file://${PACKAGE_DIR}/${OPENVDB_FILE}
DOWNLOAD_DIR ${DOWNLOAD_DIR}
URL_HASH ${OPENVDB_HASH_TYPE}=${OPENVDB_HASH}
CMAKE_GENERATOR ${PLATFORM_ALT_GENERATOR}
PREFIX ${BUILD_DIR}/openvdb
PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/openvdb/src/openvdb < ${PATCH_DIR}/openvdb.diff
PATCH_COMMAND ${OPENVDB_PATCH}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/openvdb ${DEFAULT_CMAKE_FLAGS} ${OPENVDB_EXTRA_ARGS}
INSTALL_DIR ${LIBDIR}/openvdb
)

File diff suppressed because it is too large.

@@ -55,9 +55,8 @@ void device_metal_info(vector<DeviceInfo> &devices)
info.denoisers = DENOISER_NONE;
info.id = id;
if (MetalInfo::get_device_vendor(device) == METAL_GPU_AMD) {
info.has_light_tree = false;
}
info.has_nanovdb = MetalInfo::get_device_vendor(device) == METAL_GPU_APPLE;
info.has_light_tree = MetalInfo::get_device_vendor(device) != METAL_GPU_AMD;
devices.push_back(info);
device_index++;

@@ -67,9 +67,12 @@ class MetalDevice : public Device {
std::recursive_mutex metal_mem_map_mutex;
/* Bindless Textures */
bool is_texture(const TextureInfo &tex);
device_vector<TextureInfo> texture_info;
bool need_texture_info;
id<MTLArgumentEncoder> mtlTextureArgEncoder = nil;
id<MTLArgumentEncoder> mtlBufferArgEncoder = nil;
id<MTLBuffer> buffer_bindings_1d = nil;
id<MTLBuffer> texture_bindings_2d = nil;
id<MTLBuffer> texture_bindings_3d = nil;
std::vector<id<MTLTexture>> texture_slot_map;

@@ -91,11 +91,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
}
}
texture_bindings_2d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
texture_bindings_3d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
switch (device_vendor) {
default:
break;
@@ -156,6 +151,16 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
arg_desc_texture.dataType = MTLDataTypeTexture;
arg_desc_texture.access = MTLArgumentAccessReadOnly;
mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
MTLArgumentDescriptor *arg_desc_buffer = [[MTLArgumentDescriptor alloc] init];
arg_desc_buffer.dataType = MTLDataTypePointer;
arg_desc_buffer.access = MTLArgumentAccessReadOnly;
mtlBufferArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_buffer ]];
buffer_bindings_1d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
texture_bindings_2d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
texture_bindings_3d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
texture_bindings_3d.allocatedSize);
/* command queue for non-tracing work on the GPU */
mtlGeneralCommandQueue = [mtlDevice newCommandQueue];
@@ -180,6 +185,8 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
arg_desc_tex.dataType = MTLDataTypePointer;
arg_desc_tex.access = MTLArgumentAccessReadOnly;
arg_desc_tex.index = index++;
[ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_buf_1d */
arg_desc_tex.index = index++;
[ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */
arg_desc_tex.index = index++;
@@ -253,22 +260,26 @@ MetalDevice::~MetalDevice()
* existing_devices_mutex). */
thread_scoped_lock lock(existing_devices_mutex);
for (auto &tex : texture_slot_map) {
if (tex) {
[tex release];
tex = nil;
int num_resources = texture_info.size();
for (int res = 0; res < num_resources; res++) {
if (is_texture(texture_info[res])) {
[texture_slot_map[res] release];
texture_slot_map[res] = nil;
}
}
flush_delayed_free_list();
if (texture_bindings_2d) {
stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
texture_bindings_3d.allocatedSize);
[buffer_bindings_1d release];
[texture_bindings_2d release];
[texture_bindings_3d release];
}
[mtlTextureArgEncoder release];
[mtlBufferKernelParamsEncoder release];
[mtlBufferArgEncoder release];
[mtlASArgEncoder release];
[mtlAncillaryArgEncoder release];
[mtlGeneralCommandQueue release];
@@ -332,6 +343,9 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
break;
case METAL_GPU_APPLE:
global_defines += "#define __KERNEL_METAL_APPLE__\n";
# ifdef WITH_NANOVDB
global_defines += "#define WITH_NANOVDB\n";
# endif
break;
}
@@ -546,6 +560,11 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
}
}
bool MetalDevice::is_texture(const TextureInfo &tex)
{
return (tex.depth > 0 || tex.height > 0);
}
void MetalDevice::load_texture_info()
{
if (need_texture_info) {
@@ -557,21 +576,20 @@ void MetalDevice::load_texture_info()
for (int tex = 0; tex < num_textures; tex++) {
uint64_t offset = tex * sizeof(void *);
id<MTLTexture> metal_texture = texture_slot_map[tex];
if (!metal_texture) {
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
[mtlTextureArgEncoder setTexture:nil atIndex:0];
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
[mtlTextureArgEncoder setTexture:nil atIndex:0];
}
else {
if (is_texture(texture_info[tex]) && texture_slot_map[tex]) {
id<MTLTexture> metal_texture = texture_slot_map[tex];
MTLTextureType type = metal_texture.textureType;
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
[mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
[mtlTextureArgEncoder setTexture:type == MTLTextureType3D ? metal_texture : nil atIndex:0];
}
else {
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
[mtlTextureArgEncoder setTexture:nil atIndex:0];
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
[mtlTextureArgEncoder setTexture:nil atIndex:0];
}
}
if (default_storage_mode == MTLResourceStorageModeManaged) {
[texture_bindings_2d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))];
@@ -744,7 +762,6 @@ void MetalDevice::generic_free(device_memory &mem)
mem.shared_pointer = 0;
/* Free device memory. */
delayed_free_list.push_back(mmem.mtlBuffer);
mmem.mtlBuffer = nil;
}
@@ -979,7 +996,7 @@ void MetalDevice::global_free(device_memory &mem)
void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
{
generic_alloc(mem);
MetalDevice::MetalMem *mmem = generic_alloc(mem);
generic_copy_to(mem);
/* Resize once */
@@ -988,27 +1005,32 @@ void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
/* Allocate some slots in advance, to reduce amount
* of re-allocations. */
texture_info.resize(round_up(slot + 1, 128));
texture_slot_map.resize(round_up(slot + 1, 128));
}
mem.info.data = (uint64_t)mem.device_pointer;
/* Set Mapping and tag that we need to (re-)upload to device */
texture_info[slot] = mem.info;
uint64_t offset = slot * sizeof(void *);
[mtlBufferArgEncoder setArgumentBuffer:buffer_bindings_1d offset:offset];
[mtlBufferArgEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
texture_info[slot].data = *(uint64_t *)((uint64_t)buffer_bindings_1d.contents + offset);
texture_slot_map[slot] = nil;
need_texture_info = true;
}
void MetalDevice::tex_alloc(device_texture &mem)
{
/* Check that dimensions fit within maximum allowable size.
* If 1D texture is allocated, use 1D buffer.
* See: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf */
if (mem.data_width > 16384 || mem.data_height > 16384) {
set_error(string_printf(
"Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
mem.data_width,
mem.data_height));
return;
if (mem.data_height > 0) {
if (mem.data_width > 16384 || mem.data_height > 16384) {
set_error(string_printf(
"Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
mem.data_width,
mem.data_height));
return;
}
}
MTLStorageMode storage_mode = MTLStorageModeManaged;
if (@available(macos 10.15, *)) {
if ([mtlDevice hasUnifiedMemory] &&
@@ -1148,8 +1170,9 @@ void MetalDevice::tex_alloc(device_texture &mem)
bytesPerRow:src_pitch];
}
else {
assert(0);
/* 1D texture, using linear memory. */
tex_alloc_as_buffer(mem);
return;
}
mem.device_pointer = (device_ptr)mtlTexture;
@@ -1173,17 +1196,22 @@ void MetalDevice::tex_alloc(device_texture &mem)
ssize_t min_buffer_length = sizeof(void *) * texture_info.size();
if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
if (texture_bindings_2d) {
delayed_free_list.push_back(buffer_bindings_1d);
delayed_free_list.push_back(texture_bindings_2d);
delayed_free_list.push_back(texture_bindings_3d);
stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
texture_bindings_3d.allocatedSize);
}
buffer_bindings_1d = [mtlDevice newBufferWithLength:min_buffer_length
options:default_storage_mode];
texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
options:default_storage_mode];
texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length
options:default_storage_mode];
stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
texture_bindings_3d.allocatedSize);
}
}
@@ -1210,12 +1238,18 @@ void MetalDevice::tex_alloc(device_texture &mem)
void MetalDevice::tex_free(device_texture &mem)
{
if (mem.data_depth == 0 && mem.data_height == 0) {
generic_free(mem);
return;
}
if (metal_mem_map.count(&mem)) {
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
MetalMem &mmem = *metal_mem_map.at(&mem);
assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
texture_slot_map[mem.slot] = nil;
if (texture_slot_map[mem.slot] == mmem.mtlTexture)
texture_slot_map[mem.slot] = nil;
if (mmem.mtlTexture) {
/* Free bindless texture. */

@@ -477,17 +477,21 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_3d
offset:0
atIndex:1];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->buffer_bindings_1d
offset:0
atIndex:2];
if (@available(macos 12.0, *)) {
if (metal_device_->use_metalrt) {
if (metal_device_->bvhMetalRT) {
id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:3];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
offset:0
atIndex:7];
atIndex:8];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
offset:0
atIndex:8];
atIndex:9];
}
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
@@ -497,13 +501,13 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
atIndex:1];
[metal_device_->mtlAncillaryArgEncoder
setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table]
atIndex:3 + table];
atIndex:4 + table];
[mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table]
usage:MTLResourceUsageRead];
}
else {
[metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
atIndex:3 + table];
atIndex:4 + table];
}
}
}
@@ -874,6 +878,7 @@ void MetalDeviceQueue::prepare_resources(DeviceKernel kernel)
/* ancillaries */
[mtlComputeEncoder_ useResource:metal_device_->texture_bindings_2d usage:MTLResourceUsageRead];
[mtlComputeEncoder_ useResource:metal_device_->texture_bindings_3d usage:MTLResourceUsageRead];
[mtlComputeEncoder_ useResource:metal_device_->buffer_bindings_1d usage:MTLResourceUsageRead];
}
id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)

@@ -5,13 +5,14 @@
CCL_NAMESPACE_BEGIN
#ifdef WITH_NANOVDB
# define NDEBUG /* Disable "assert" in device code */
# define NANOVDB_USE_INTRINSICS
# include "nanovdb/NanoVDB.h"
# include "nanovdb/util/SampleFromVoxels.h"
#if !defined __KERNEL_METAL__
# ifdef WITH_NANOVDB
# define NDEBUG /* Disable "assert" in device code */
# define NANOVDB_USE_INTRINSICS
# include "nanovdb/NanoVDB.h"
# include "nanovdb/util/SampleFromVoxels.h"
# endif
#endif
/* w0, w1, w2, and w3 are the four cubic B-spline basis functions. */
ccl_device float cubic_w0(float a)
{
@@ -126,7 +127,7 @@ kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, fl
#ifdef WITH_NANOVDB
template<typename T, typename S>
ccl_device typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_tricubic_nanovdb(
S &s, float x, float y, float z)
ccl_private S &s, float x, float y, float z)
{
float px = floorf(x);
float py = floorf(y);
@@ -157,13 +158,19 @@ ccl_device typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_tric
g1y * (g0x * s(Vec3f(x0, y1, z1)) + g1x * s(Vec3f(x1, y1, z1))));
}
# if defined(__KERNEL_METAL__)
template<typename T>
__attribute__((noinline)) typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_nanovdb(
ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
# else
template<typename T>
ccl_device_noinline typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_nanovdb(
ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
# endif
{
using namespace nanovdb;
NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
ccl_global NanoGrid<T> *const grid = (ccl_global NanoGrid<T> *)info.data;
typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
AccessorType acc = grid->getAccessor();
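
The ccl_global and ccl_private qualifiers added above come from the Cycles
per-backend compatibility headers. A rough sketch of the assumed mapping on
Metal (not copied from the actual headers):

/* Assumed mapping: Metal spells the address spaces out explicitly, while other
 * backends can define the macros to their own qualifiers or to nothing. */
#ifdef __KERNEL_METAL__
#  define ccl_global device   /* data in GPU device memory, e.g. the NanoGrid */
#  define ccl_private thread  /* per-thread stack data, e.g. the sampler object */
#else
#  define ccl_global
#  define ccl_private
#endif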

@@ -290,6 +290,10 @@ typedef metal::raytracing::intersector<triangle_data> metalrt_blas_intersector_t
/* texture bindings and sampler setup */
struct Buffer1DParamsMetal {
device float *buf;
};
struct Texture2DParamsMetal {
texture2d<float, access::sample> tex;
};
@@ -306,6 +310,7 @@ struct MetalRTBlasWrapper {
struct MetalAncillaries {
device Texture2DParamsMetal *textures_2d;
device Texture3DParamsMetal *textures_3d;
device Buffer1DParamsMetal *buffers;
#ifdef __METALRT__
metalrt_as_type accel_struct;
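
With the new buffers member, 1D images that previously hit the assert in
tex_alloc can be bound as plain buffers instead of texture objects. A
hypothetical access pattern (the slot and index variables are assumptions,
not taken from this diff):

/* Hypothetical sketch: read 1D image data for texture slot `slot` through the
 * new buffer bindings rather than a sampled texture. */
device const float *data = metal_ancillaries->buffers[slot].buf;
float value = data[index];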

@@ -3,6 +3,13 @@
// clang-format off
#ifdef WITH_NANOVDB
# define NDEBUG /* Disable "assert" in device code */
# define NANOVDB_USE_INTRINSICS
# include "nanovdb/NanoVDB.h"
# include "nanovdb/util/SampleFromVoxels.h"
#endif
/* Open the Metal kernel context class
* Necessary to access resource bindings */
class MetalKernelContext {