EEVEE Next: Horizon scan Metal compiler tuning

Tune Metal compilation parameters for the horizon scan
shaders to improve performance. Selectively unrolling
loops and modifying compilation heuristics result in a
~25% uplift in tracing shader performance, due to
improved latency management.

Authored by Apple: Michael Parkin-White

Pull Request: https://projects.blender.org/blender/blender/pulls/119737
This commit is contained in:
Jason Fielder 2024-04-04 16:24:20 +02:00 committed by Clément Foucault
parent cbc7962a73
commit 93cc55889c
3 changed files with 15 additions and 0 deletions

View File

@ -112,6 +112,10 @@ HorizonScanResult horizon_scan_eval(vec3 vP,
float occlusion_accum = 0.0;
SphericalHarmonicL1 sh_accum = spherical_harmonics_L1_new();
#if defined(GPU_METAL) && defined(GPU_APPLE)
/* NOTE: Full loop unroll hint increases performance on Apple Silicon. */
# pragma clang loop unroll(full)
#endif
for (int slice = 0; slice < slice_len; slice++) {
#if 0 /* For debug purpose. For when slice_len is greater than 2. */
vec2 v_dir = sample_circle(((float(slice) + noise.x) / float(slice_len)));
@ -145,6 +149,10 @@ HorizonScanResult horizon_scan_eval(vec3 vP,
* screen at once and just scan through. */
ScreenSpaceRay ssray = raytrace_screenspace_ray_create(ray, pixel_size);
#if defined(GPU_METAL) && defined(GPU_APPLE)
/* NOTE: Full loop unroll hint increases performance on Apple Silicon. */
# pragma clang loop unroll(full)
#endif
for (int j = 0; j < sample_count; j++) {
/* Always cross at least one pixel. */
float time = 1.0 + square((float(j) + noise.y) / float(sample_count)) * ssray.max_time;

View File

@ -203,6 +203,8 @@ GPU_SHADER_CREATE_INFO(eevee_horizon_scan)
.image(4, GPU_RGBA8, Qualifier::WRITE, ImageType::FLOAT_2D, "horizon_radiance_2_img")
.image(5, GPU_RGBA8, Qualifier::WRITE, ImageType::FLOAT_2D, "horizon_radiance_3_img")
.storage_buf(7, Qualifier::READ, "uint", "tiles_coord_buf[]")
/* Metal: Provide compiler with hint to tune per-thread resource allocation. */
.mtl_max_total_threads_per_threadgroup(400)
.compute_source("eevee_horizon_scan_comp.glsl");
GPU_SHADER_CREATE_INFO(eevee_horizon_denoise)
@ -241,6 +243,8 @@ GPU_SHADER_CREATE_INFO(eevee_horizon_resolve)
.image(4, RAYTRACE_RADIANCE_FORMAT, Qualifier::READ_WRITE, ImageType::FLOAT_2D, "closure1_img")
.image(5, RAYTRACE_RADIANCE_FORMAT, Qualifier::READ_WRITE, ImageType::FLOAT_2D, "closure2_img")
.storage_buf(7, Qualifier::READ, "uint", "tiles_coord_buf[]")
/* Metal: Provide compiler with hint to tune per-thread resource allocation. */
.mtl_max_total_threads_per_threadgroup(400)
.compute_source("eevee_horizon_resolve_comp.glsl");
#undef image_out

View File

@ -81,6 +81,9 @@ static void standard_defines(Vector<const char *> &sources)
else if (GPU_type_matches(GPU_DEVICE_INTEL, GPU_OS_ANY, GPU_DRIVER_ANY)) {
sources.append("#define GPU_INTEL\n");
}
else if (GPU_type_matches(GPU_DEVICE_APPLE, GPU_OS_ANY, GPU_DRIVER_ANY)) {
sources.append("#define GPU_APPLE\n");
}
/* some useful defines to detect OS type */
if (GPU_type_matches(GPU_DEVICE_ANY, GPU_OS_WIN, GPU_DRIVER_ANY)) {
sources.append("#define OS_WIN\n");