EEVEE Next: Horizon scan Metal compiler tuning
Tune Metal compilation parameters for horizon scan shaders for optimal performance. Selectively unrolling loops and modifying compilation heuristics result in a ~25% uplift in tracing shader performance due to improved latency management. Authored by Apple: Michael Parkin-White. Pull Request: https://projects.blender.org/blender/blender/pulls/119737
This commit is contained in:
parent
cbc7962a73
commit
93cc55889c
|
@ -112,6 +112,10 @@ HorizonScanResult horizon_scan_eval(vec3 vP,
|
||||||
float occlusion_accum = 0.0;
|
float occlusion_accum = 0.0;
|
||||||
SphericalHarmonicL1 sh_accum = spherical_harmonics_L1_new();
|
SphericalHarmonicL1 sh_accum = spherical_harmonics_L1_new();
|
||||||
|
|
||||||
|
#if defined(GPU_METAL) && defined(GPU_APPLE)
|
||||||
|
/* NOTE: Full loop unroll hint increases performance on Apple Silicon. */
|
||||||
|
# pragma clang loop unroll(full)
|
||||||
|
#endif
|
||||||
for (int slice = 0; slice < slice_len; slice++) {
|
for (int slice = 0; slice < slice_len; slice++) {
|
||||||
#if 0 /* For debug purpose. For when slice_len is greater than 2. */
|
#if 0 /* For debug purpose. For when slice_len is greater than 2. */
|
||||||
vec2 v_dir = sample_circle(((float(slice) + noise.x) / float(slice_len)));
|
vec2 v_dir = sample_circle(((float(slice) + noise.x) / float(slice_len)));
|
||||||
|
@ -145,6 +149,10 @@ HorizonScanResult horizon_scan_eval(vec3 vP,
|
||||||
* screen at once and just scan through. */
|
* screen at once and just scan through. */
|
||||||
ScreenSpaceRay ssray = raytrace_screenspace_ray_create(ray, pixel_size);
|
ScreenSpaceRay ssray = raytrace_screenspace_ray_create(ray, pixel_size);
|
||||||
|
|
||||||
|
#if defined(GPU_METAL) && defined(GPU_APPLE)
|
||||||
|
/* NOTE: Full loop unroll hint increases performance on Apple Silicon. */
|
||||||
|
# pragma clang loop unroll(full)
|
||||||
|
#endif
|
||||||
for (int j = 0; j < sample_count; j++) {
|
for (int j = 0; j < sample_count; j++) {
|
||||||
/* Always cross at least one pixel. */
|
/* Always cross at least one pixel. */
|
||||||
float time = 1.0 + square((float(j) + noise.y) / float(sample_count)) * ssray.max_time;
|
float time = 1.0 + square((float(j) + noise.y) / float(sample_count)) * ssray.max_time;
|
||||||
|
|
|
@ -203,6 +203,8 @@ GPU_SHADER_CREATE_INFO(eevee_horizon_scan)
|
||||||
.image(4, GPU_RGBA8, Qualifier::WRITE, ImageType::FLOAT_2D, "horizon_radiance_2_img")
|
.image(4, GPU_RGBA8, Qualifier::WRITE, ImageType::FLOAT_2D, "horizon_radiance_2_img")
|
||||||
.image(5, GPU_RGBA8, Qualifier::WRITE, ImageType::FLOAT_2D, "horizon_radiance_3_img")
|
.image(5, GPU_RGBA8, Qualifier::WRITE, ImageType::FLOAT_2D, "horizon_radiance_3_img")
|
||||||
.storage_buf(7, Qualifier::READ, "uint", "tiles_coord_buf[]")
|
.storage_buf(7, Qualifier::READ, "uint", "tiles_coord_buf[]")
|
||||||
|
/* Metal: Provide compiler with hint to tune per-thread resource allocation. */
|
||||||
|
.mtl_max_total_threads_per_threadgroup(400)
|
||||||
.compute_source("eevee_horizon_scan_comp.glsl");
|
.compute_source("eevee_horizon_scan_comp.glsl");
|
||||||
|
|
||||||
GPU_SHADER_CREATE_INFO(eevee_horizon_denoise)
|
GPU_SHADER_CREATE_INFO(eevee_horizon_denoise)
|
||||||
|
@ -241,6 +243,8 @@ GPU_SHADER_CREATE_INFO(eevee_horizon_resolve)
|
||||||
.image(4, RAYTRACE_RADIANCE_FORMAT, Qualifier::READ_WRITE, ImageType::FLOAT_2D, "closure1_img")
|
.image(4, RAYTRACE_RADIANCE_FORMAT, Qualifier::READ_WRITE, ImageType::FLOAT_2D, "closure1_img")
|
||||||
.image(5, RAYTRACE_RADIANCE_FORMAT, Qualifier::READ_WRITE, ImageType::FLOAT_2D, "closure2_img")
|
.image(5, RAYTRACE_RADIANCE_FORMAT, Qualifier::READ_WRITE, ImageType::FLOAT_2D, "closure2_img")
|
||||||
.storage_buf(7, Qualifier::READ, "uint", "tiles_coord_buf[]")
|
.storage_buf(7, Qualifier::READ, "uint", "tiles_coord_buf[]")
|
||||||
|
/* Metal: Provide compiler with hint to tune per-thread resource allocation. */
|
||||||
|
.mtl_max_total_threads_per_threadgroup(400)
|
||||||
.compute_source("eevee_horizon_resolve_comp.glsl");
|
.compute_source("eevee_horizon_resolve_comp.glsl");
|
||||||
|
|
||||||
#undef image_out
|
#undef image_out
|
||||||
|
|
|
@ -81,6 +81,9 @@ static void standard_defines(Vector<const char *> &sources)
|
||||||
else if (GPU_type_matches(GPU_DEVICE_INTEL, GPU_OS_ANY, GPU_DRIVER_ANY)) {
|
else if (GPU_type_matches(GPU_DEVICE_INTEL, GPU_OS_ANY, GPU_DRIVER_ANY)) {
|
||||||
sources.append("#define GPU_INTEL\n");
|
sources.append("#define GPU_INTEL\n");
|
||||||
}
|
}
|
||||||
|
else if (GPU_type_matches(GPU_DEVICE_APPLE, GPU_OS_ANY, GPU_DRIVER_ANY)) {
|
||||||
|
sources.append("#define GPU_APPLE\n");
|
||||||
|
}
|
||||||
/* some useful defines to detect OS type */
|
/* some useful defines to detect OS type */
|
||||||
if (GPU_type_matches(GPU_DEVICE_ANY, GPU_OS_WIN, GPU_DRIVER_ANY)) {
|
if (GPU_type_matches(GPU_DEVICE_ANY, GPU_OS_WIN, GPU_DRIVER_ANY)) {
|
||||||
sources.append("#define OS_WIN\n");
|
sources.append("#define OS_WIN\n");
|
||||||
|
|
Loading…
Reference in New Issue