Various Exact Boolean parallelizations and optimizations.

From patch D11780 from Erik Abrahamsson. It parallelizes making the vertices, destruction of map entries, finding if the result is PWN, finding triangle adjacencies, and finding the ambient cell. The latter needs a parallel_reduce from tbb, so added one into BLI_task.hh so that if WITH_TBB is false, the code will still work. On Erik's 6-core machine, the elapsed time went from 17.5s to 11.8s (33% faster) on an intersection of two spheres with 3.1M faces. On Howard's 24-core machine, the elapsed time went from 18.7s to 10.8s for the same test.
2021-07-05 18:09:36 -04:00 · 2021-07-05 18:09:36 -04:00 · ceff86aafe
parent cf17f7e0cc
commit ceff86aafe
5 changed files with 286 additions and 97 deletions
--- a/source/blender/blenkernel/intern/mesh_boolean_convert.cc
+++ b/source/blender/blenkernel/intern/mesh_boolean_convert.cc
@ -38,6 +38,7 @@
 #include "BLI_mesh_boolean.hh"
 #include "BLI_mesh_intersect.hh"
 #include "BLI_span.hh"
 #include "BLI_task.hh"
 namespace blender::meshintersect {
@ -309,22 +310,38 @@ static IMesh meshes_to_imesh(Span<const Mesh *> meshes,
                                                        clean_obmat(*obmats[mi]);
    r_info->to_target_transform[mi] = inv_target_mat * objn_mat;
-    /* Skip the matrix multiplication for each point when there is no transform for a mesh,
+    Vector<Vert *> verts(me->totvert);
-     * for example when the first mesh is already in the target space. (Note the logic directly
+    Span<MVert> mverts = Span(me->mvert, me->totvert);
-     * above, which uses an identity matrix with a null input transform). */
+
    /* Allocate verts
     * Skip the matrix multiplication for each point when there is no transform for a mesh,
     * for example when the first mesh is already in the target space. (Note the logic
     * directly above, which uses an identity matrix with a null input transform). */
    if (obmats[mi] == nullptr) {
-      for (const MVert &vert : Span(me->mvert, me->totvert)) {
+      threading::parallel_for(mverts.index_range(), 2048, [&](IndexRange range) {
-        const float3 co = float3(vert.co);
+        float3 co;
-        r_info->mesh_to_imesh_vert[v] = arena.add_or_find_vert(mpq3(co.x, co.y, co.z), v);
+        for (int i : range) {
-        ++v;
+          co = float3(mverts[i].co);
-      }
+          mpq3 mco = mpq3(co.x, co.y, co.z);
          double3 dco(mco[0].get_d(), mco[1].get_d(), mco[2].get_d());
          verts[i] = new Vert(mco, dco, NO_INDEX, i);
        }
      });
    }
    else {
-      for (const MVert &vert : Span(me->mvert, me->totvert)) {
+      threading::parallel_for(mverts.index_range(), 2048, [&](IndexRange range) {
-        const float3 co = r_info->to_target_transform[mi] * float3(vert.co);
+        float3 co;
-        r_info->mesh_to_imesh_vert[v] = arena.add_or_find_vert(mpq3(co.x, co.y, co.z), v);
+        for (int i : range) {
-        ++v;
+          co = r_info->to_target_transform[mi] * float3(mverts[i].co);
-      }
+          mpq3 mco = mpq3(co.x, co.y, co.z);
          double3 dco(mco[0].get_d(), mco[1].get_d(), mco[2].get_d());
          verts[i] = new Vert(mco, dco, NO_INDEX, i);
        }
      });
    }
    for (int i : mverts.index_range()) {
      r_info->mesh_to_imesh_vert[v] = arena.add_or_find_vert(verts[i]);
      ++v;
    }
    for (const MPoly &poly : Span(me->mpoly, me->totpoly)) {
--- a/source/blender/blenlib/BLI_mesh_intersect.hh
+++ b/source/blender/blenlib/BLI_mesh_intersect.hh
@ -225,6 +225,7 @@ class IMeshArena : NonCopyable, NonMovable {
   */
  const Vert *add_or_find_vert(const mpq3 &co, int orig);
  const Vert *add_or_find_vert(const double3 &co, int orig);
  const Vert *add_or_find_vert(Vert *vert);
  Face *add_face(Span<const Vert *> verts,
                 int orig,
--- a/source/blender/blenlib/BLI_task.hh
+++ b/source/blender/blenlib/BLI_task.hh
@ -28,6 +28,7 @@
 #    define NOMINMAX
 #    define TBB_MIN_MAX_CLEANUP
 #  endif
 #  include "tbb/parallel_reduce.h"
 #  include <tbb/blocked_range.h>
 #  include <tbb/parallel_for.h>
 #  include <tbb/parallel_for_each.h>
@ -76,6 +77,27 @@ void parallel_for(IndexRange range, int64_t grain_size, const Function &function
 #endif
 }
 template<typename Value, typename Function, typename Reduction>
 Value parallel_reduce(IndexRange range,
                      int64_t grain_size,
                      const Value &identity,
                      const Function &function,
                      const Reduction &reduction)
 {
 #ifdef WITH_TBB
  return tbb::parallel_reduce(
      tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
      identity,
      [&](const tbb::blocked_range<int64_t> &subrange, const Value &ident) {
        return function(IndexRange(subrange.begin(), subrange.size()), ident);
      },
      reduction);
 #else
  UNUSED_VARS(grain_size, reduction);
  return function(range, identity);
 #endif
 }
 /** See #BLI_task_isolate for a description of what isolating a task means. */
 template<typename Function> void isolate_task(const Function &function)
 {
--- a/source/blender/blenlib/intern/mesh_boolean.cc
+++ b/source/blender/blenlib/intern/mesh_boolean.cc
@ -21,6 +21,7 @@
 #ifdef WITH_GMP
 #  include <algorithm>
 #  include <atomic>
 #  include <fstream>
 #  include <iostream>
@ -50,6 +51,7 @@
 #  include "BLI_mesh_boolean.hh"
 #  ifdef WITH_TBB
 #    include "tbb/parallel_reduce.h"
 #    include "tbb/spin_mutex.h"
 #  endif
@ -201,9 +203,14 @@ TriMeshTopology::TriMeshTopology(const IMesh &tm)
        BLI_assert(edges != nullptr);
      }
      edges->append_non_duplicates(e);
-      auto createf = [t](Vector<int> **pvec) { *pvec = new Vector<int>{t}; };
+
-      auto modifyf = [t](Vector<int> **pvec) { (*pvec)->append_non_duplicates(t); };
+      auto p = edge_tri_.lookup_ptr(Edge(v, vnext));
-      this->edge_tri_.add_or_modify(Edge(v, vnext), createf, modifyf);
+      if (p == nullptr) {
        edge_tri_.add_new(e, new Vector<int>{t});
      }
      else {
        (*p)->append_non_duplicates(t);
      }
    }
  }
  /* Debugging. */
@ -228,9 +235,18 @@ TriMeshTopology::TriMeshTopology(const IMesh &tm)
 TriMeshTopology::~TriMeshTopology()
 {
-  for (const Vector<int> *vec : edge_tri_.values()) {
+  Vector<Vector<int> *> values;
-    delete vec;
+
  /* Deconstructing is faster in parallel, so it is worth building an array of things to delete. */
  for (auto item : edge_tri_.values()) {
    values.append(item);
  }
  threading::parallel_for(values.index_range(), 256, [&](IndexRange range) {
    for (int i : range) {
      delete values[i];
    }
  });
 }
 /** A Patch is a maximal set of triangles that share manifold edges only. */
@ -719,6 +735,18 @@ static PatchesInfo find_patches(const IMesh &tm, const TriMeshTopology &tmtopo)
  PatchesInfo pinfo(ntri);
  /* Algorithm: Grow patches across manifold edges as long as there are unassigned triangles. */
  Stack<int> cur_patch_grow;
  /* Create an Array containing indices of adjacent faces. */
  Array<std::array<int, 3>> t_others(tm.face_size());
  threading::parallel_for(tm.face_index_range(), 2048, [&](IndexRange range) {
    for (int t : range) {
      const Face &tri = *tm.face(t);
      for (int i = 0; i < 3; ++i) {
        Edge e(tri[i], tri[(i + 1) % 3]);
        t_others[t][i] = tmtopo.other_tri_if_manifold(e, t);
      }
    }
  });
  for (int t : tm.face_index_range()) {
    if (pinfo.tri_patch(t) == -1) {
      cur_patch_grow.push(t);
@ -739,7 +767,7 @@ static PatchesInfo find_patches(const IMesh &tm, const TriMeshTopology &tmtopo)
        const Face &tri = *tm.face(tcand);
        for (int i = 0; i < 3; ++i) {
          Edge e(tri[i], tri[(i + 1) % 3]);
-          int t_other = tmtopo.other_tri_if_manifold(e, tcand);
+          int t_other = t_others[tcand][i];
          if (dbg_level > 1) {
            std::cout << "  edge " << e << " generates t_other=" << t_other << "\n";
          }
@ -953,12 +981,8 @@ static void sort_by_signed_triangle_index(Vector<int> &g,
 * To accommodate this:
 * If extra_tri is non-null, then an index of EXTRA_TRI_INDEX should use it for the triangle.
 */
-static Array<int> sort_tris_around_edge(const IMesh &tm,
+static Array<int> sort_tris_around_edge(
-                                        const TriMeshTopology &tmtopo,
+    const IMesh &tm, const Edge e, const Span<int> tris, const int t0, const Face *extra_tri)
                                        const Edge e,
                                        const Span<int> tris,
                                        const int t0,
                                        const Face *extra_tri)
 {
  /* Divide and conquer, quick-sort-like sort.
   * Pick a triangle t0, then partition into groups:
@ -1023,14 +1047,14 @@ static Array<int> sort_tris_around_edge(const IMesh &tm,
    }
  }
  if (g3.size() > 1) {
-    Array<int> g3sorted = sort_tris_around_edge(tm, tmtopo, e, g3, t0, extra_tri);
+    Array<int> g3sorted = sort_tris_around_edge(tm, e, g3, t0, extra_tri);
    std::copy(g3sorted.begin(), g3sorted.end(), g3.begin());
    if (dbg_level > 1) {
      std::cout << "g3 sorted: " << g3 << "\n";
    }
  }
  if (g4.size() > 1) {
-    Array<int> g4sorted = sort_tris_around_edge(tm, tmtopo, e, g4, t0, extra_tri);
+    Array<int> g4sorted = sort_tris_around_edge(tm, e, g4, t0, extra_tri);
    std::copy(g4sorted.begin(), g4sorted.end(), g4.begin());
    if (dbg_level > 1) {
      std::cout << "g4 sorted: " << g4 << "\n";
@ -1076,7 +1100,7 @@ static void find_cells_from_edge(const IMesh &tm,
  const Vector<int> *edge_tris = tmtopo.edge_tris(e);
  BLI_assert(edge_tris != nullptr);
  Array<int> sorted_tris = sort_tris_around_edge(
-      tm, tmtopo, e, Span<int>(*edge_tris), (*edge_tris)[0], nullptr);
+      tm, e, Span<int>(*edge_tris), (*edge_tris)[0], nullptr);
  int n_edge_tris = edge_tris->size();
  Array<int> edge_patches(n_edge_tris);
@ -1338,34 +1362,46 @@ static bool patch_cell_graph_ok(const CellsInfo &cinfo, const PatchesInfo &pinfo
 static bool is_pwn(const IMesh &tm, const TriMeshTopology &tmtopo)
 {
  constexpr int dbg_level = 0;
  std::atomic<bool> is_pwn = true;
  Vector<std::pair<Edge, Vector<int> *>> tris;
  for (auto item : tmtopo.edge_tri_map_items()) {
-    const Edge &edge = item.key;
+    tris.append(std::pair<Edge, Vector<int> *>(item.key, item.value));
-    int tot_orient = 0;
+  }
-    /* For each face t attached to edge, add +1 if the edge
+
-     * is positively in t, and -1 if negatively in t. */
+  threading::parallel_for(tris.index_range(), 2048, [&](IndexRange range) {
-    for (int t : *item.value) {
+    for (int j : range) {
-      const Face &face = *tm.face(t);
+      const Edge &edge = tris[j].first;
-      BLI_assert(face.size() == 3);
+      int tot_orient = 0;
-      for (int i : face.index_range()) {
+      /* For each face t attached to edge, add +1 if the edge
-        if (face[i] == edge.v0()) {
+       * is positively in t, and -1 if negatively in t. */
-          if (face[(i + 1) % 3] == edge.v1()) {
+      for (int t : *tris[j].second) {
-            ++tot_orient;
+        const Face &face = *tm.face(t);
-          }
+        BLI_assert(face.size() == 3);
-          else {
+        for (int i : face.index_range()) {
-            BLI_assert(face[(i + 3 - 1) % 3] == edge.v1());
+          if (face[i] == edge.v0()) {
-            --tot_orient;
+            if (face[(i + 1) % 3] == edge.v1()) {
              ++tot_orient;
            }
            else {
              BLI_assert(face[(i + 3 - 1) % 3] == edge.v1());
              --tot_orient;
            }
          }
        }
      }
-    }
+      if (tot_orient != 0) {
-    if (tot_orient != 0) {
+        if (dbg_level > 0) {
-      if (dbg_level > 0) {
+          std::cout << "edge causing non-pwn: " << edge << "\n";
-        std::cout << "edge causing non-pwn: " << edge << "\n";
+        }
        is_pwn = false;
 #  ifdef WITH_TBB
        tbb::task::self().cancel_group_execution();
 #  endif
      }
      return false;
    }
-  }
+  });
-  return true;
+  return is_pwn.load();
 }
 /**
@ -1396,8 +1432,7 @@ static int find_cell_for_point_near_edge(mpq3 p,
  Array<int> edge_tris(etris->size() + 1);
  std::copy(etris->begin(), etris->end(), edge_tris.begin());
  edge_tris[edge_tris.size() - 1] = EXTRA_TRI_INDEX;
-  Array<int> sorted_tris = sort_tris_around_edge(
+  Array<int> sorted_tris = sort_tris_around_edge(tm, e, edge_tris, edge_tris[0], dummy_tri);
      tm, tmtopo, e, edge_tris, edge_tris[0], dummy_tri);
  if (dbg_level > 0) {
    std::cout << "sorted tris = " << sorted_tris << "\n";
  }
@ -1425,6 +1460,11 @@ static int find_cell_for_point_near_edge(mpq3 p,
  return c;
 }
 static const Vert *max_x_vert(const Vert *a, const Vert *b)
 {
  return (a->co_exact.x > b->co_exact.x) ? a : b;
 }
 /**
 * Find the ambient cell -- that is, the cell that is outside
 * all other cells.
@ -1452,39 +1492,64 @@ static int find_ambient_cell(const IMesh &tm,
  /* First find a vertex with the maximum x value. */
  /* Prefer not to populate the verts in the #IMesh just for this. */
  const Vert *v_extreme;
-  mpq_class extreme_x;
+
  if (component_patches == nullptr) {
-    v_extreme = (*tm.face(0))[0];
+    v_extreme = threading::parallel_reduce(
-    extreme_x = v_extreme->co_exact.x;
+        tm.face_index_range(),
-    for (const Face *f : tm.faces()) {
+        2048,
-      for (const Vert *v : *f) {
+        (*tm.face(0))[0],
-        const mpq_class &x = v->co_exact.x;
+        [&](IndexRange range, const Vert *init) {
-        if (x > extreme_x) {
+          const Vert *ans = init;
-          v_extreme = v;
+          for (int i : range) {
-          extreme_x = x;
+            const Face *f = tm.face(i);
-        }
+            for (const Vert *v : *f) {
-      }
+              if (v->co_exact.x > ans->co_exact.x) {
-    }
+                ans = v;
              }
            }
          }
          return ans;
        },
        max_x_vert);
  }
  else {
    if (dbg_level > 0) {
      std::cout << "restrict to patches " << *component_patches << "\n";
    }
    int p0 = (*component_patches)[0];
-    v_extreme = (*tm.face(pinfo.patch(p0).tri(0)))[0];
+    v_extreme = threading::parallel_reduce(
-    extreme_x = v_extreme->co_exact.x;
+        component_patches->index_range(),
-    for (int p : *component_patches) {
+        2048,
-      for (int t : pinfo.patch(p).tris()) {
+        (*tm.face(pinfo.patch(p0).tri(0)))[0],
-        const Face *f = tm.face(t);
+        [&](IndexRange range, const Vert *init) {
-        for (const Vert *v : *f) {
+          const Vert *ans = init;
-          const mpq_class &x = v->co_exact.x;
+          for (int pi : range) {
-          if (x > extreme_x) {
+            int p = (*component_patches)[pi];
-            v_extreme = v;
+            const Vert *tris_ans = threading::parallel_reduce(
-            extreme_x = x;
+                IndexRange(pinfo.patch(p).tot_tri()),
                2048,
                init,
                [&](IndexRange tris_range, const Vert *t_init) {
                  const Vert *v_ans = t_init;
                  for (int i : tris_range) {
                    int t = pinfo.patch(p).tri(i);
                    const Face *f = tm.face(t);
                    for (const Vert *v : *f) {
                      if (v->co_exact.x > v_ans->co_exact.x) {
                        v_ans = v;
                      }
                    }
                  }
                  return v_ans;
                },
                max_x_vert);
            if (tris_ans->co_exact.x > ans->co_exact.x) {
              ans = tris_ans;
            }
          }
-        }
+          return ans;
-      }
+        },
-    }
+        max_x_vert);
  }
  if (dbg_level > 0) {
    std::cout << "v_extreme = " << v_extreme << "\n";
@ -1493,7 +1558,8 @@ static int find_ambient_cell(const IMesh &tm,
   * when projected onto the XY plane. That edge is guaranteed to
   * be on the convex hull of the mesh. */
  const Vector<Edge> &edges = tmtopo.vert_edges(v_extreme);
-  const mpq_class extreme_y = v_extreme->co_exact.y;
+  const mpq_class &extreme_x = v_extreme->co_exact.x;
  const mpq_class &extreme_y = v_extreme->co_exact.y;
  Edge ehull;
  mpq_class max_abs_slope = -1;
  for (Edge e : edges) {
@ -1514,8 +1580,8 @@ static int find_ambient_cell(const IMesh &tm,
  if (dbg_level > 0) {
    std::cout << "ehull = " << ehull << " slope = " << max_abs_slope << "\n";
  }
-  /* Sort triangles around ehull, including a dummy triangle that include a known point in ambient
+  /* Sort triangles around ehull, including a dummy triangle that include a known point in
-   * cell. */
+   * ambient cell. */
  mpq3 p_in_ambient = v_extreme->co_exact;
  p_in_ambient.x += 1;
  int c_ambient = find_cell_for_point_near_edge(p_in_ambient, ehull, tm, tmtopo, pinfo, arena);
@ -2816,7 +2882,8 @@ static IMesh raycast_patches_boolean(const IMesh &tm,
 }
 /**
 * If \a tri1 and \a tri2 have a common edge (in opposite orientation),
- * return the indices into \a tri1 and \a tri2 where that common edge starts. Else return (-1,-1).
+ * return the indices into \a tri1 and \a tri2 where that common edge starts. Else return
 * (-1,-1).
 */
 static std::pair<int, int> find_tris_common_edge(const Face &tri1, const Face &tri2)
 {
@ -3378,8 +3445,8 @@ static void dissolve_verts(IMesh *imesh, const Array<bool> dissolve, IMeshArena
 * will have an original edge that is NO_INDEX.
 * Not all triangulation edges can be removed: if they ended up non-trivially overlapping a real
 * input edge, then we need to keep it. Also, some are necessary to make the output satisfy
- * the "valid #BMesh" property: we can't produce output faces that have repeated vertices in them,
+ * the "valid #BMesh" property: we can't produce output faces that have repeated vertices in
- * or have several disconnected boundaries (e.g., faces with holes).
+ * them, or have several disconnected boundaries (e.g., faces with holes).
 */
 static IMesh polymesh_from_trimesh_with_dissolve(const IMesh &tm_out,
                                                 const IMesh &imesh_in,
--- a/source/blender/blenlib/intern/mesh_intersect.cc
+++ b/source/blender/blenlib/intern/mesh_intersect.cc
@ -43,6 +43,7 @@
 #  include "BLI_set.hh"
 #  include "BLI_span.hh"
 #  include "BLI_task.h"
 #  include "BLI_task.hh"
 #  include "BLI_threads.h"
 #  include "BLI_vector.hh"
 #  include "BLI_vector_set.hh"
@ -51,6 +52,10 @@
 #  include "BLI_mesh_intersect.hh"
 #  ifdef WITH_TBB
 #    include "tbb/parallel_sort.h"
 #  endif
 // #  define PERFDEBUG
 namespace blender::meshintersect {
@ -406,6 +411,11 @@ class IMeshArena::IMeshArenaImpl : NonCopyable, NonMovable {
    return add_or_find_vert(mco, co, orig);
  }
  const Vert *add_or_find_vert(Vert *vert)
  {
    return add_or_find_vert_(vert);
  }
  Face *add_face(Span<const Vert *> verts, int orig, Span<int> edge_origs, Span<bool> is_intersect)
  {
    Face *f = new Face(verts, next_face_id_++, orig, edge_origs, is_intersect);
@ -486,10 +496,9 @@ class IMeshArena::IMeshArenaImpl : NonCopyable, NonMovable {
 private:
  const Vert *add_or_find_vert(const mpq3 &mco, const double3 &dco, int orig)
  {
-    /* Don't allocate Vert yet, in case it is already there. */
+    Vert *vtry = new Vert(mco, dco, NO_INDEX, NO_INDEX);
    Vert vtry(mco, dco, NO_INDEX, NO_INDEX);
    const Vert *ans;
-    VSetKey vskey(&vtry);
+    VSetKey vskey(vtry);
    if (intersect_use_threading) {
 #  ifdef USE_SPINLOCK
      BLI_spin_lock(&lock_);
@ -499,7 +508,9 @@ class IMeshArena::IMeshArenaImpl : NonCopyable, NonMovable {
    }
    const VSetKey *lookup = vset_.lookup_key_ptr(vskey);
    if (!lookup) {
-      vskey.vert = new Vert(mco, dco, next_vert_id_++, orig);
+      vtry->id = next_vert_id_++;
      vtry->orig = orig;
      vskey.vert = vtry;  // new Vert(mco, dco, next_vert_id_++, orig);
      vset_.add_new(vskey);
      allocated_verts_.append(std::unique_ptr<Vert>(vskey.vert));
      ans = vskey.vert;
@ -510,6 +521,45 @@ class IMeshArena::IMeshArenaImpl : NonCopyable, NonMovable {
       * This is the intended semantics: if the Vert already
       * exists then we are merging verts and using the first-seen
       * one as the canonical one. */
      delete vtry;
      ans = lookup->vert;
    }
    if (intersect_use_threading) {
 #  ifdef USE_SPINLOCK
      BLI_spin_unlock(&lock_);
 #  else
      BLI_mutex_unlock(mutex_);
 #  endif
    }
    return ans;
  };
  const Vert *add_or_find_vert_(Vert *vtry)
  {
    const Vert *ans;
    VSetKey vskey(vtry);
    if (intersect_use_threading) {
 #  ifdef USE_SPINLOCK
      BLI_spin_lock(&lock_);
 #  else
      BLI_mutex_lock(mutex_);
 #  endif
    }
    const VSetKey *lookup = vset_.lookup_key_ptr(vskey);
    if (!lookup) {
      vtry->id = next_vert_id_++;
      vskey.vert = vtry;  // new Vert(mco, dco, next_vert_id_++, orig);
      vset_.add_new(vskey);
      allocated_verts_.append(std::unique_ptr<Vert>(vskey.vert));
      ans = vskey.vert;
    }
    else {
      /* It was a duplicate, so return the existing one.
       * Note that the returned Vert may have a different orig.
       * This is the intended semantics: if the Vert already
       * exists then we are merging verts and using the first-seen
       * one as the canonical one. */
      delete vtry;
      ans = lookup->vert;
    }
    if (intersect_use_threading) {
@ -550,6 +600,11 @@ const Vert *IMeshArena::add_or_find_vert(const mpq3 &co, int orig)
  return pimpl_->add_or_find_vert(co, orig);
 }
 const Vert *IMeshArena::add_or_find_vert(Vert *vert)
 {
  return pimpl_->add_or_find_vert(vert);
 }
 Face *IMeshArena::add_face(Span<const Vert *> verts,
                           int orig,
                           Span<int> edge_origs,
@ -633,7 +688,11 @@ void IMesh::populate_vert(int max_verts)
   * TODO: when all debugged, set fix_order = false. */
  const bool fix_order = true;
  if (fix_order) {
 #  ifdef WITH_TBB
    tbb::parallel_sort(vert_.begin(), vert_.end(), [](const Vert *a, const Vert *b) {
 #  else
    std::sort(vert_.begin(), vert_.end(), [](const Vert *a, const Vert *b) {
 #  endif
      if (a->orig != NO_INDEX && b->orig != NO_INDEX) {
        return a->orig < b->orig;
      }
@ -2187,6 +2246,14 @@ IMesh triangulate_polymesh(IMesh &imesh, IMeshArena *arena)
  Vector<Face *> face_tris;
  constexpr int estimated_tris_per_face = 3;
  face_tris.reserve(estimated_tris_per_face * imesh.face_size());
  threading::parallel_for(imesh.face_index_range(), 2048, [&](IndexRange range) {
    for (int i : range) {
      Face *f = imesh.face(i);
      if (!f->plane_populated() && f->size() >= 4) {
        f->populate_plane(false);
      }
    }
  });
  for (Face *f : imesh.faces()) {
    /* Tessellate face f, following plan similar to #BM_face_calc_tessellation. */
    int flen = f->size();
@ -2278,12 +2345,22 @@ class TriOverlaps {
    if (two_trees_no_self) {
      tree_b_ = BLI_bvhtree_new(tm.face_size(), FLT_EPSILON, 8, 6);
    }
    /* Create a Vector containing face shape. */
    Vector<int> shapes;
    shapes.resize(tm.face_size());
    threading::parallel_for(tm.face_index_range(), 2048, [&](IndexRange range) {
      for (int t : range) {
        shapes[t] = shape_fn(tm.face(t)->orig);
      }
    });
    float bbpts[6];
    for (int t : tm.face_index_range()) {
      const BoundingBox &bb = tri_bb[t];
      copy_v3_v3(bbpts, bb.min);
      copy_v3_v3(bbpts + 3, bb.max);
-      int shape = shape_fn(tm.face(t)->orig);
+      int shape = shapes[t];
      if (two_trees_no_self) {
        if (shape == 0) {
          BLI_bvhtree_insert(tree_, t, bbpts, 2);
@ -2575,11 +2652,13 @@ static void calc_subdivided_non_cluster_tris(Array<IMesh> &r_tri_subdivided,
      0, overlap_tri_range_tot, &data, calc_subdivided_tri_range_func, &settings);
  /* Now have to put in the triangles that are the same as the input ones, and not in clusters.
   */
-  for (int t : tm.face_index_range()) {
+  threading::parallel_for(tm.face_index_range(), 2048, [&](IndexRange range) {
-    if (r_tri_subdivided[t].face_size() == 0 && clinfo.tri_cluster(t) == NO_INDEX) {
+    for (int t : range) {
-      r_tri_subdivided[t] = IMesh({tm.face(t)});
+      if (r_tri_subdivided[t].face_size() == 0 && clinfo.tri_cluster(t) == NO_INDEX) {
        r_tri_subdivided[t] = IMesh({tm.face(t)});
      }
    }
-  }
+  });
 }
 /**
@ -2936,11 +3015,15 @@ IMesh trimesh_nary_intersect(const IMesh &tm_in,
  double overlap_time = PIL_check_seconds_timer();
  std::cout << "intersect overlaps calculated, time = " << overlap_time - bb_calc_time << "\n";
 #  endif
-  for (int t : tm_clean->face_index_range()) {
+  Array<IMesh> tri_subdivided(tm_clean->face_size(), NoInitialization());
-    if (tri_ov.first_overlap_index(t) != -1) {
+  threading::parallel_for(tm_clean->face_index_range(), 1024, [&](IndexRange range) {
-      tm_clean->face(t)->populate_plane(true);
+    for (int t : range) {
      if (tri_ov.first_overlap_index(t) != -1) {
        tm_clean->face(t)->populate_plane(true);
      }
      new (static_cast<void *>(&tri_subdivided[t])) IMesh;
    }
-  }
+  });
 #  ifdef PERFDEBUG
  double plane_populate = PIL_check_seconds_timer();
  std::cout << "planes populated, time = " << plane_populate - overlap_time << "\n";
@ -2965,7 +3048,6 @@ IMesh trimesh_nary_intersect(const IMesh &tm_in,
  doperfmax(1, clinfo.tot_cluster());
  doperfmax(2, tri_ov.overlap().size());
 #  endif
  Array<IMesh> tri_subdivided(tm_clean->face_size());
  calc_subdivided_non_cluster_tris(tri_subdivided, *tm_clean, itt_map, clinfo, tri_ov, arena);
 #  ifdef PERFDEBUG
  double subdivided_tris_time = PIL_check_seconds_timer();