Cycles: add NanoVDB support for Metal on Apple Silicon
Contributed by Yulia Kuznetcova at Apple. NanoVDB is patched to give add address spaces required by Metal. We hope that in the future Metal will support the generic address space. For AMD and Intel this is currently not available since it causes a performance regression also on scenes without volumes. Pull Request #104837
This commit is contained in:
parent
8b8d8acc84
commit
02c2970983
|
@ -44,13 +44,21 @@ set(OPENVDB_EXTRA_ARGS
|
|||
# -DLLVM_DIR=${LIBDIR}/llvm/lib/cmake/llvm
|
||||
)
|
||||
|
||||
set(OPENVDB_PATCH ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/openvdb/src/openvdb < ${PATCH_DIR}/openvdb.diff)
|
||||
if(APPLE)
|
||||
set(OPENVDB_PATCH
|
||||
${OPENVDB_PATCH} &&
|
||||
${PATCH_CMD} -p 0 -d ${BUILD_DIR}/openvdb/src/openvdb < ${PATCH_DIR}/openvdb_metal.diff
|
||||
)
|
||||
endif()
|
||||
|
||||
ExternalProject_Add(openvdb
|
||||
URL file://${PACKAGE_DIR}/${OPENVDB_FILE}
|
||||
DOWNLOAD_DIR ${DOWNLOAD_DIR}
|
||||
URL_HASH ${OPENVDB_HASH_TYPE}=${OPENVDB_HASH}
|
||||
CMAKE_GENERATOR ${PLATFORM_ALT_GENERATOR}
|
||||
PREFIX ${BUILD_DIR}/openvdb
|
||||
PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/openvdb/src/openvdb < ${PATCH_DIR}/openvdb.diff
|
||||
PATCH_COMMAND ${OPENVDB_PATCH}
|
||||
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/openvdb ${DEFAULT_CMAKE_FLAGS} ${OPENVDB_EXTRA_ARGS}
|
||||
INSTALL_DIR ${LIBDIR}/openvdb
|
||||
)
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -55,9 +55,8 @@ void device_metal_info(vector<DeviceInfo> &devices)
|
|||
info.denoisers = DENOISER_NONE;
|
||||
info.id = id;
|
||||
|
||||
if (MetalInfo::get_device_vendor(device) == METAL_GPU_AMD) {
|
||||
info.has_light_tree = false;
|
||||
}
|
||||
info.has_nanovdb = MetalInfo::get_device_vendor(device) == METAL_GPU_APPLE;
|
||||
info.has_light_tree = MetalInfo::get_device_vendor(device) != METAL_GPU_AMD;
|
||||
|
||||
devices.push_back(info);
|
||||
device_index++;
|
||||
|
|
|
@ -67,9 +67,12 @@ class MetalDevice : public Device {
|
|||
std::recursive_mutex metal_mem_map_mutex;
|
||||
|
||||
/* Bindless Textures */
|
||||
bool is_texture(const TextureInfo &tex);
|
||||
device_vector<TextureInfo> texture_info;
|
||||
bool need_texture_info;
|
||||
id<MTLArgumentEncoder> mtlTextureArgEncoder = nil;
|
||||
id<MTLArgumentEncoder> mtlBufferArgEncoder = nil;
|
||||
id<MTLBuffer> buffer_bindings_1d = nil;
|
||||
id<MTLBuffer> texture_bindings_2d = nil;
|
||||
id<MTLBuffer> texture_bindings_3d = nil;
|
||||
std::vector<id<MTLTexture>> texture_slot_map;
|
||||
|
|
|
@ -91,11 +91,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
|
|||
}
|
||||
}
|
||||
|
||||
texture_bindings_2d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
|
||||
texture_bindings_3d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
|
||||
|
||||
stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
|
||||
|
||||
switch (device_vendor) {
|
||||
default:
|
||||
break;
|
||||
|
@ -156,6 +151,16 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
|
|||
arg_desc_texture.dataType = MTLDataTypeTexture;
|
||||
arg_desc_texture.access = MTLArgumentAccessReadOnly;
|
||||
mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
|
||||
MTLArgumentDescriptor *arg_desc_buffer = [[MTLArgumentDescriptor alloc] init];
|
||||
arg_desc_buffer.dataType = MTLDataTypePointer;
|
||||
arg_desc_buffer.access = MTLArgumentAccessReadOnly;
|
||||
mtlBufferArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_buffer ]];
|
||||
|
||||
buffer_bindings_1d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
|
||||
texture_bindings_2d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
|
||||
texture_bindings_3d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
|
||||
stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
|
||||
texture_bindings_3d.allocatedSize);
|
||||
|
||||
/* command queue for non-tracing work on the GPU */
|
||||
mtlGeneralCommandQueue = [mtlDevice newCommandQueue];
|
||||
|
@ -180,6 +185,8 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
|
|||
arg_desc_tex.dataType = MTLDataTypePointer;
|
||||
arg_desc_tex.access = MTLArgumentAccessReadOnly;
|
||||
|
||||
arg_desc_tex.index = index++;
|
||||
[ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_buf_1d */
|
||||
arg_desc_tex.index = index++;
|
||||
[ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */
|
||||
arg_desc_tex.index = index++;
|
||||
|
@ -253,22 +260,26 @@ MetalDevice::~MetalDevice()
|
|||
* existing_devices_mutex). */
|
||||
thread_scoped_lock lock(existing_devices_mutex);
|
||||
|
||||
for (auto &tex : texture_slot_map) {
|
||||
if (tex) {
|
||||
[tex release];
|
||||
tex = nil;
|
||||
int num_resources = texture_info.size();
|
||||
for (int res = 0; res < num_resources; res++) {
|
||||
if (is_texture(texture_info[res])) {
|
||||
[texture_slot_map[res] release];
|
||||
texture_slot_map[res] = nil;
|
||||
}
|
||||
}
|
||||
|
||||
flush_delayed_free_list();
|
||||
|
||||
if (texture_bindings_2d) {
|
||||
stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
|
||||
|
||||
stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
|
||||
texture_bindings_3d.allocatedSize);
|
||||
[buffer_bindings_1d release];
|
||||
[texture_bindings_2d release];
|
||||
[texture_bindings_3d release];
|
||||
}
|
||||
[mtlTextureArgEncoder release];
|
||||
[mtlBufferKernelParamsEncoder release];
|
||||
[mtlBufferArgEncoder release];
|
||||
[mtlASArgEncoder release];
|
||||
[mtlAncillaryArgEncoder release];
|
||||
[mtlGeneralCommandQueue release];
|
||||
|
@ -332,6 +343,9 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
|
|||
break;
|
||||
case METAL_GPU_APPLE:
|
||||
global_defines += "#define __KERNEL_METAL_APPLE__\n";
|
||||
# ifdef WITH_NANOVDB
|
||||
global_defines += "#define WITH_NANOVDB\n";
|
||||
# endif
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -546,6 +560,11 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
|
|||
}
|
||||
}
|
||||
|
||||
bool MetalDevice::is_texture(const TextureInfo &tex)
|
||||
{
|
||||
return (tex.depth > 0 || tex.height > 0);
|
||||
}
|
||||
|
||||
void MetalDevice::load_texture_info()
|
||||
{
|
||||
if (need_texture_info) {
|
||||
|
@ -557,21 +576,20 @@ void MetalDevice::load_texture_info()
|
|||
|
||||
for (int tex = 0; tex < num_textures; tex++) {
|
||||
uint64_t offset = tex * sizeof(void *);
|
||||
|
||||
id<MTLTexture> metal_texture = texture_slot_map[tex];
|
||||
if (!metal_texture) {
|
||||
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
|
||||
[mtlTextureArgEncoder setTexture:nil atIndex:0];
|
||||
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
|
||||
[mtlTextureArgEncoder setTexture:nil atIndex:0];
|
||||
}
|
||||
else {
|
||||
if (is_texture(texture_info[tex]) && texture_slot_map[tex]) {
|
||||
id<MTLTexture> metal_texture = texture_slot_map[tex];
|
||||
MTLTextureType type = metal_texture.textureType;
|
||||
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
|
||||
[mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
|
||||
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
|
||||
[mtlTextureArgEncoder setTexture:type == MTLTextureType3D ? metal_texture : nil atIndex:0];
|
||||
}
|
||||
else {
|
||||
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
|
||||
[mtlTextureArgEncoder setTexture:nil atIndex:0];
|
||||
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
|
||||
[mtlTextureArgEncoder setTexture:nil atIndex:0];
|
||||
}
|
||||
}
|
||||
if (default_storage_mode == MTLResourceStorageModeManaged) {
|
||||
[texture_bindings_2d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))];
|
||||
|
@ -744,7 +762,6 @@ void MetalDevice::generic_free(device_memory &mem)
|
|||
mem.shared_pointer = 0;
|
||||
|
||||
/* Free device memory. */
|
||||
delayed_free_list.push_back(mmem.mtlBuffer);
|
||||
mmem.mtlBuffer = nil;
|
||||
}
|
||||
|
||||
|
@ -979,7 +996,7 @@ void MetalDevice::global_free(device_memory &mem)
|
|||
|
||||
void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
|
||||
{
|
||||
generic_alloc(mem);
|
||||
MetalDevice::MetalMem *mmem = generic_alloc(mem);
|
||||
generic_copy_to(mem);
|
||||
|
||||
/* Resize once */
|
||||
|
@ -988,27 +1005,32 @@ void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
|
|||
/* Allocate some slots in advance, to reduce amount
|
||||
* of re-allocations. */
|
||||
texture_info.resize(round_up(slot + 1, 128));
|
||||
texture_slot_map.resize(round_up(slot + 1, 128));
|
||||
}
|
||||
|
||||
mem.info.data = (uint64_t)mem.device_pointer;
|
||||
|
||||
/* Set Mapping and tag that we need to (re-)upload to device */
|
||||
texture_info[slot] = mem.info;
|
||||
uint64_t offset = slot * sizeof(void *);
|
||||
[mtlBufferArgEncoder setArgumentBuffer:buffer_bindings_1d offset:offset];
|
||||
[mtlBufferArgEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
|
||||
texture_info[slot].data = *(uint64_t *)((uint64_t)buffer_bindings_1d.contents + offset);
|
||||
texture_slot_map[slot] = nil;
|
||||
need_texture_info = true;
|
||||
}
|
||||
|
||||
void MetalDevice::tex_alloc(device_texture &mem)
|
||||
{
|
||||
/* Check that dimensions fit within maximum allowable size.
|
||||
* If 1D texture is allocated, use 1D buffer.
|
||||
* See: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf */
|
||||
if (mem.data_width > 16384 || mem.data_height > 16384) {
|
||||
set_error(string_printf(
|
||||
"Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
|
||||
mem.data_width,
|
||||
mem.data_height));
|
||||
return;
|
||||
if (mem.data_height > 0) {
|
||||
if (mem.data_width > 16384 || mem.data_height > 16384) {
|
||||
set_error(string_printf(
|
||||
"Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
|
||||
mem.data_width,
|
||||
mem.data_height));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
MTLStorageMode storage_mode = MTLStorageModeManaged;
|
||||
if (@available(macos 10.15, *)) {
|
||||
if ([mtlDevice hasUnifiedMemory] &&
|
||||
|
@ -1148,8 +1170,9 @@ void MetalDevice::tex_alloc(device_texture &mem)
|
|||
bytesPerRow:src_pitch];
|
||||
}
|
||||
else {
|
||||
assert(0);
|
||||
/* 1D texture, using linear memory. */
|
||||
tex_alloc_as_buffer(mem);
|
||||
return;
|
||||
}
|
||||
|
||||
mem.device_pointer = (device_ptr)mtlTexture;
|
||||
|
@ -1173,17 +1196,22 @@ void MetalDevice::tex_alloc(device_texture &mem)
|
|||
ssize_t min_buffer_length = sizeof(void *) * texture_info.size();
|
||||
if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
|
||||
if (texture_bindings_2d) {
|
||||
delayed_free_list.push_back(buffer_bindings_1d);
|
||||
delayed_free_list.push_back(texture_bindings_2d);
|
||||
delayed_free_list.push_back(texture_bindings_3d);
|
||||
|
||||
stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
|
||||
stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
|
||||
texture_bindings_3d.allocatedSize);
|
||||
}
|
||||
buffer_bindings_1d = [mtlDevice newBufferWithLength:min_buffer_length
|
||||
options:default_storage_mode];
|
||||
texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
|
||||
options:default_storage_mode];
|
||||
texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length
|
||||
options:default_storage_mode];
|
||||
|
||||
stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
|
||||
stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
|
||||
texture_bindings_3d.allocatedSize);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1210,12 +1238,18 @@ void MetalDevice::tex_alloc(device_texture &mem)
|
|||
|
||||
void MetalDevice::tex_free(device_texture &mem)
|
||||
{
|
||||
if (mem.data_depth == 0 && mem.data_height == 0) {
|
||||
generic_free(mem);
|
||||
return;
|
||||
}
|
||||
|
||||
if (metal_mem_map.count(&mem)) {
|
||||
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
|
||||
MetalMem &mmem = *metal_mem_map.at(&mem);
|
||||
|
||||
assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
|
||||
texture_slot_map[mem.slot] = nil;
|
||||
if (texture_slot_map[mem.slot] == mmem.mtlTexture)
|
||||
texture_slot_map[mem.slot] = nil;
|
||||
|
||||
if (mmem.mtlTexture) {
|
||||
/* Free bindless texture. */
|
||||
|
|
|
@ -477,17 +477,21 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
|
|||
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_3d
|
||||
offset:0
|
||||
atIndex:1];
|
||||
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->buffer_bindings_1d
|
||||
offset:0
|
||||
atIndex:2];
|
||||
|
||||
if (@available(macos 12.0, *)) {
|
||||
if (metal_device_->use_metalrt) {
|
||||
if (metal_device_->bvhMetalRT) {
|
||||
id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
|
||||
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
|
||||
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:3];
|
||||
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
|
||||
offset:0
|
||||
atIndex:7];
|
||||
atIndex:8];
|
||||
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
|
||||
offset:0
|
||||
atIndex:8];
|
||||
atIndex:9];
|
||||
}
|
||||
|
||||
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
|
||||
|
@ -497,13 +501,13 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
|
|||
atIndex:1];
|
||||
[metal_device_->mtlAncillaryArgEncoder
|
||||
setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table]
|
||||
atIndex:3 + table];
|
||||
atIndex:4 + table];
|
||||
[mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table]
|
||||
usage:MTLResourceUsageRead];
|
||||
}
|
||||
else {
|
||||
[metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
|
||||
atIndex:3 + table];
|
||||
atIndex:4 + table];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -874,6 +878,7 @@ void MetalDeviceQueue::prepare_resources(DeviceKernel kernel)
|
|||
/* ancillaries */
|
||||
[mtlComputeEncoder_ useResource:metal_device_->texture_bindings_2d usage:MTLResourceUsageRead];
|
||||
[mtlComputeEncoder_ useResource:metal_device_->texture_bindings_3d usage:MTLResourceUsageRead];
|
||||
[mtlComputeEncoder_ useResource:metal_device_->buffer_bindings_1d usage:MTLResourceUsageRead];
|
||||
}
|
||||
|
||||
id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
|
||||
|
|
|
@ -5,13 +5,14 @@
|
|||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifdef WITH_NANOVDB
|
||||
# define NDEBUG /* Disable "assert" in device code */
|
||||
# define NANOVDB_USE_INTRINSICS
|
||||
# include "nanovdb/NanoVDB.h"
|
||||
# include "nanovdb/util/SampleFromVoxels.h"
|
||||
#if !defined __KERNEL_METAL__
|
||||
# ifdef WITH_NANOVDB
|
||||
# define NDEBUG /* Disable "assert" in device code */
|
||||
# define NANOVDB_USE_INTRINSICS
|
||||
# include "nanovdb/NanoVDB.h"
|
||||
# include "nanovdb/util/SampleFromVoxels.h"
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* w0, w1, w2, and w3 are the four cubic B-spline basis functions. */
|
||||
ccl_device float cubic_w0(float a)
|
||||
{
|
||||
|
@ -126,7 +127,7 @@ kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, fl
|
|||
#ifdef WITH_NANOVDB
|
||||
template<typename T, typename S>
|
||||
ccl_device typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_tricubic_nanovdb(
|
||||
S &s, float x, float y, float z)
|
||||
ccl_private S &s, float x, float y, float z)
|
||||
{
|
||||
float px = floorf(x);
|
||||
float py = floorf(y);
|
||||
|
@ -157,13 +158,19 @@ ccl_device typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_tric
|
|||
g1y * (g0x * s(Vec3f(x0, y1, z1)) + g1x * s(Vec3f(x1, y1, z1))));
|
||||
}
|
||||
|
||||
# if defined(__KERNEL_METAL__)
|
||||
template<typename T>
|
||||
__attribute__((noinline)) typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_nanovdb(
|
||||
ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
|
||||
# else
|
||||
template<typename T>
|
||||
ccl_device_noinline typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_nanovdb(
|
||||
ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
|
||||
# endif
|
||||
{
|
||||
using namespace nanovdb;
|
||||
|
||||
NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
|
||||
ccl_global NanoGrid<T> *const grid = (ccl_global NanoGrid<T> *)info.data;
|
||||
typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
|
||||
AccessorType acc = grid->getAccessor();
|
||||
|
||||
|
|
|
@ -290,6 +290,10 @@ typedef metal::raytracing::intersector<triangle_data> metalrt_blas_intersector_t
|
|||
|
||||
/* texture bindings and sampler setup */
|
||||
|
||||
struct Buffer1DParamsMetal {
|
||||
device float *buf;
|
||||
};
|
||||
|
||||
struct Texture2DParamsMetal {
|
||||
texture2d<float, access::sample> tex;
|
||||
};
|
||||
|
@ -306,6 +310,7 @@ struct MetalRTBlasWrapper {
|
|||
struct MetalAncillaries {
|
||||
device Texture2DParamsMetal *textures_2d;
|
||||
device Texture3DParamsMetal *textures_3d;
|
||||
device Buffer1DParamsMetal *buffers;
|
||||
|
||||
#ifdef __METALRT__
|
||||
metalrt_as_type accel_struct;
|
||||
|
|
|
@ -3,6 +3,13 @@
|
|||
|
||||
// clang-format off
|
||||
|
||||
#ifdef WITH_NANOVDB
|
||||
# define NDEBUG /* Disable "assert" in device code */
|
||||
# define NANOVDB_USE_INTRINSICS
|
||||
# include "nanovdb/NanoVDB.h"
|
||||
# include "nanovdb/util/SampleFromVoxels.h"
|
||||
#endif
|
||||
|
||||
/* Open the Metal kernel context class
|
||||
* Necessary to access resource bindings */
|
||||
class MetalKernelContext {
|
||||
|
|
Loading…
Reference in New Issue