Cycles: Tune kernel sizes for oneAPI device

This brings a 1-3% performance improvement depending on the scenes, on
the Arc A770.
This commit is contained in:
Xavier Hallade 2024-04-04 16:04:13 +02:00
parent 022e46a7e2
commit cbc7962a73
1 changed files with 17 additions and 10 deletions

View File

@ -773,8 +773,9 @@ void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
size_t &kernel_local_size)
{
assert(queue);
const static size_t preferred_work_group_size_intersect_shading = 32;
const static size_t preferred_work_group_size_intersect = 128;
const static size_t preferred_work_group_size_shading = 256;
const static size_t preferred_work_group_size_shading_simd8 = 64;
/* Shader evaluation kernels seems to use some amount of shared memory, so better
* to avoid usage of maximum work group sizes for them. */
const static size_t preferred_work_group_size_shader_evaluation = 256;
@ -783,6 +784,9 @@ void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
const static size_t preferred_work_group_size_cryptomatte = 512;
const static size_t preferred_work_group_size_default = 1024;
const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
const size_t max_work_group_size = device.get_info<sycl::info::device::max_work_group_size>();
size_t preferred_work_group_size = 0;
switch (kernel) {
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
@ -792,6 +796,9 @@ void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT:
preferred_work_group_size = preferred_work_group_size_intersect;
break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
@ -799,9 +806,13 @@ void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
case DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT:
preferred_work_group_size = preferred_work_group_size_intersect_shading;
break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT: {
const bool device_is_simd8 =
(device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() == 8);
preferred_work_group_size = (device_is_simd8) ? preferred_work_group_size_shading_simd8 :
preferred_work_group_size_shading;
} break;
case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
preferred_work_group_size = preferred_work_group_size_cryptomatte;
@ -829,11 +840,7 @@ void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
preferred_work_group_size = preferred_work_group_size_default;
}
const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue)
->get_device()
.get_info<sycl::info::device::max_work_group_size>();
kernel_local_size = std::min(limit_work_group_size, preferred_work_group_size);
kernel_local_size = std::min(max_work_group_size, preferred_work_group_size);
/* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
* we extend work size to fit uniformity requirements. */