From dc577120c983cbd8bff7757cb316ab1011cd7283 Mon Sep 17 00:00:00 2001
From: Andrew Adams <andrew.b.adams@gmail.com>
Date: Tue, 19 May 2026 16:08:21 -0700
Subject: [PATCH 1/6] Speed up inlining by batching with iterative deepening
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ScheduleFunctions used to call inline_function once per inlinable Func,
which is O(N) walks of the IR. BoundsInference used a separate Inliner
that inlined every Func at once via one CSE invocation, which can be
exponentially expensive in deep inline chains because RemoveLets inside
CSE re-walks shared subtrees under nested Lets.

Neither extreme is great; the sweet spot is to batch a small constant
number of Funcs per CSE invocation, with intermediate CSE flattening
the working IR between batches. Empirically K ≈ 8 is a flat optimum
across a wide range of input sizes (the optimum drifts only as log N).

This change:

- Unifies BoundsInference's Inliner and the one in Inline.h.
- Adds Inliner::do_inlining(Expr/Stmt) which processes the to_inline
  set by iterative deepening: passes raise an active_limit through the
  add() sequence, only inlining entries below the current limit.
- Each Entry caches both its qualified body and the lowest order_id
  of any inlinable Call still inside it, so the cache is re-processed
  exactly when the current limit makes new things eligible.
- The outer loop jumps the limit directly to the lowest pending
  order_id rather than stepping by batch_size, so an Expr that only
  references far-out entries doesn't do useless intermediate passes.
- ScheduleFunctions collects consecutive inlinable groups into one
  inline_functions call, flushing before each realization so
  validate_schedule sees an up-to-date 's'.

For best performance, callers should add() in consumer-first
(reverse-topological) order. Any add() order is correct, just slower.

On a Fibonacci-shaped stress test (test/correctness/deep_inline_chain
at n=200), schedule_functions and computation bounds inference each go
from ~4s to ~0.13s. On the apps suite the speedup is more modest --
typical apps have only a few funcs in any single batch, so most of the
savings come from amortizing the per-call walk of 's' across a batch
rather than from the iterative deepening loop.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 src/BoundsInference.cpp                |  69 ++----
 src/Inline.cpp                         | 296 ++++++++++++++++++-------
 src/Inline.h                           |  86 ++++++-
 src/ScheduleFunctions.cpp              |  63 ++++--
 test/correctness/CMakeLists.txt        |   1 +
 test/correctness/deep_inline_chain.cpp |  38 ++++
 6 files changed, 409 insertions(+), 144 deletions(-)
 create mode 100644 test/correctness/deep_inline_chain.cpp
diff --git a/src/BoundsInference.cpp b/src/BoundsInference.cpp
index dda848e3bc7e..76e12b0cb303 100644
--- a/src/BoundsInference.cpp
+++ b/src/BoundsInference.cpp
@@ -189,52 +189,6 @@ bool is_fused_with_others(const vector<vector<Function>> &fused_groups,
     return false;
 }
 
-// An inliner that can inline an entire set of functions at once. The inliner in
-// Inline.h only handles with one function at a time.
-class Inliner : public IRMutator {
-public:
-    std::set<Function, Function::Compare> to_inline;
-
-    Expr do_inlining(const Expr &e) {
-        return common_subexpression_elimination(mutate(e));
-    }
-
-protected:
-    std::map<Function, std::map<int, Expr>, Function::Compare> qualified_bodies;
-
-    Expr get_qualified_body(const Function &f, int idx) {
-        auto it = qualified_bodies.find(f);
-        if (it != qualified_bodies.end()) {
-            auto it2 = it->second.find(idx);
-            if (it2 != it->second.end()) {
-                return it2->second;
-            }
-        }
-        Expr e = qualify(f.name() + ".", f.values()[idx]);
-        e = do_inlining(e);
-        qualified_bodies[f][idx] = e;
-        return e;
-    }
-
-    Expr visit(const Call *op) override {
-        if (op->func.defined()) {
-            Function f(op->func);
-            if (to_inline.count(f)) {
-                auto args = mutate(op->args);
-                Expr body = get_qualified_body(f, op->value_index);
-                const vector<string> &func_args = f.args();
-                for (size_t i = 0; i < args.size(); i++) {
-                    body = Let::make(f.name() + "." + func_args[i], args[i], body);
-                }
-                return body;
-            }
-        }
-        return IRMutator::visit(op);
-    }
-
-    using IRMutator::visit;
-};
-
 class BoundsInference : public IRMutator {
 public:
     const vector<Function> &funcs;
@@ -849,16 +803,23 @@ class BoundsInference : public IRMutator {
         // Compute the intrinsic relationships between the stages of
         // the functions.
 
-        // Figure out which functions will be inlined away
+        // Figure out which functions will be inlined away.
         vector<bool> inlined(f.size());
         for (size_t i = 0; i < inlined.size(); i++) {
-            if (i < f.size() - 1 &&
-                f[i].schedule().compute_level().is_inlined() &&
-                f[i].can_be_inlined()) {
-                inlined[i] = true;
-                inliner.to_inline.insert(f[i]);
-            } else {
-                inlined[i] = false;
+            inlined[i] = (i < f.size() - 1 &&
+                          f[i].schedule().compute_level().is_inlined() &&
+                          f[i].can_be_inlined());
+        }
+        // Register them with the Inliner in consumer-first order. f is in
+        // realization (producer-first) order, so we iterate backwards: the
+        // outermost consumer of each chain is added first, the bottom
+        // producer last. The Inliner's iterative-deepening loop processes
+        // entries in add() order, so consumers go first -- their materialized
+        // bodies expose Calls to producers, which the later (deeper) passes
+        // then substitute. See Inliner's class doc for the full picture.
+        for (size_t i = inlined.size(); i > 0; i--) {
+            if (inlined[i - 1]) {
+                inliner.add(f[i - 1]);
             }
         }
 
diff --git a/src/Inline.cpp b/src/Inline.cpp
index ce829f1c7326..73c3e0d9d383 100644
--- a/src/Inline.cpp
+++ b/src/Inline.cpp
@@ -1,3 +1,5 @@
+#include <chrono>
+#include <iostream>
 #include <set>
 
 #include "CSE.h"
@@ -97,103 +99,247 @@ void validate_schedule_inlined_function(Function f) {
     }
 }
 
-class Inliner : public IRMutator {
-    using IRMutator::visit;
-
-    Function func;
-
-    Expr visit(const Call *op) override {
-        if (op->name == func.name()) {
-
-            // Mutate the args
-            auto args = mutate(op->args);
-
-            // Grab the body
-            Expr body = qualify(func.name() + ".", func.values()[op->value_index]);
-
-            const vector<string> func_args = func.args();
-
-            // Bind the args using Let nodes
-            internal_assert(args.size() == func_args.size());
-
-            for (size_t i = 0; i < args.size(); i++) {
-                if (is_const(args[i]) || args[i].as<Variable>()) {
-                    body = substitute(func.name() + "." + func_args[i], args[i], body);
-                } else {
-                    body = Let::make(func.name() + "." + func_args[i], args[i], body);
-                }
-            }
-
-            found++;
+// ---------------------------------------------------------------------------
+// Inliner: design notes
+//
+// Picking how many functions to inline per CSE invocation is a balance
+// between two penalties:
+//
+// - Doing all N functions in one pass is bad. CSE's RemoveLets, while
+//   substituting Let bindings away, can re-walk shared subtrees
+//   exponentially in their nesting depth, and the materialized body at a
+//   call site after N levels of inlining is a DAG of exactly that shape.
+//   Per-CSE-invocation cost grows roughly exponentially in the batch size.
+//
+// - Doing one function per pass is also bad. Each pass walks the entire
+//   current IR, which grows as bodies get materialized; N passes ×
+//   O(|s|) is quadratic in N.
+//
+// The exponential bites much harder than the quadratic, so we use a small
+// constant batch_size (8). Empirically the optimum drifts roughly like
+// log(N) (K≈8 at N=100, K≈12 at N=300), so a constant works well across
+// the range we measured.
+//
+// Implementation: each add() call assigns the entry an order_id (its
+// position in the add() sequence). do_inlining processes the set by
+// iterative deepening through that sequence -- a series of passes that
+// raise an active_limit by batch_size each time, with visit(Call) only
+// inlining entries whose order_id is below the current limit. The CSE
+// that runs between passes (per-Provide in the Stmt mutator, top-level
+// in the Expr form) flattens shared subtrees into named Let references,
+// so the next pass's RemoveLets input has bounded shared-Let nesting.
+//
+// Correct for any add() order: a Call that survives a pass (because it
+// got wrapped inside a body materialized by an earlier limit) is picked
+// up by a later pass once its order_id falls under the limit. But add()
+// in consumer-first (reverse-topological) order is best for performance:
+// each pass's substitutions then expose the next layer of producer Calls
+// for the following pass. With the wrong order, the work piles up in the
+// final pass once the limit hits the entries the call sites actually
+// reference, defeating the bounded-per-pass cost.
+// ---------------------------------------------------------------------------
+
+Inliner::Inliner(const Function &f) {
+    internal_assert(f.can_be_inlined()) << "Illegal to inline " << f.name() << "\n";
+    validate_schedule_inlined_function(f);
+    add(f);
+}
 
-            return body;
+void Inliner::add(const Function &f) {
+    for (int i = 0; i < f.outputs(); i++) {
+        Key k{f.name(), i};
+        auto [it, inserted] = to_inline.insert({k, Entry{}});
+        if (inserted) {
+            it->second.func = f;
+            // order_id is the entry's position in the add() sequence, used
+            // by do_inlining's iterative-deepening loop to decide when this
+            // entry first becomes eligible to inline.
+            it->second.order_id = to_inline.size() - 1;
+        }
+    }
+}
 
-        } else {
-            return IRMutator::visit(op);
+Expr Inliner::do_inlining(const Expr &e) {
+    // Single-pass path: either we're already inside an outer deepening
+    // pass (recursive call from get_qualified_body), or the set is small
+    // enough that the deepening loop wouldn't do anything useful.
+    if (active_limit != SIZE_MAX || to_inline.size() <= batch_size) {
+        return common_subexpression_elimination(mutate(e));
+    }
+    Expr result = e;
+    size_t limit = batch_size;
+    while (true) {
+        active_limit = limit;
+        min_skipped_order_id = SIZE_MAX;
+        Expr mutated = mutate(result);
+        active_limit = SIZE_MAX;
+        // Only run CSE if mutate actually changed anything; a pass that
+        // only discovered above-the-limit Calls produces an unchanged
+        // result and there's nothing for CSE to do.
+        if (!mutated.same_as(result)) {
+            result = common_subexpression_elimination(mutated);
+        }
+        // Re-processing of cached bodies in visit(Call) bubbles their
+        // remaining un-inlined Call into min_skipped_order_id, so this
+        // truly means "nothing inlinable is left anywhere in the result."
+        if (min_skipped_order_id == SIZE_MAX) {
+            break;
         }
+        // Jump directly to the next un-inlined entry instead of stepping
+        // by batch_size through regions of order-space the input doesn't
+        // reference. (No need to check limit against to_inline.size():
+        // if every entry were below the limit, none could have been
+        // skipped, so min_skipped_order_id would be SIZE_MAX already.)
+        limit = min_skipped_order_id + batch_size;
     }
+    return result;
+}
 
-    Expr visit(const Variable *op) override {
-        if (op->name == func.name() + ".buffer") {
-            const Call *call = func.is_wrapper();
-            internal_assert(call);
-            // Do a whole-image inline. Substitute the .buffer symbol
-            // for the wrapped object's .buffer symbol.
-            string buf_name;
-            if (call->call_type == Call::Halide) {
-                buf_name = call->name;
-                if (Function(call->func).outputs() > 1) {
-                    buf_name += "." + std::to_string(call->value_index);
-                }
-                buf_name += ".buffer";
-                return Variable::make(type_of<halide_buffer_t *>(), buf_name);
-            } else if (call->param.defined()) {
-                return Variable::make(type_of<halide_buffer_t *>(), call->name + ".buffer", call->param);
-            } else {
-                internal_assert(call->image.defined());
-                return Variable::make(type_of<halide_buffer_t *>(), call->name + ".buffer", call->image);
-            }
-        } else {
-            return op;
+Stmt Inliner::do_inlining(const Stmt &s) {
+    if (active_limit != SIZE_MAX || to_inline.size() <= batch_size) {
+        return mutate(s);
+    }
+    // Same deepening loop as the Expr version. No top-of-loop CSE here
+    // because the CSE that bounds per-pass work happens in two places
+    // already: the per-Provide CSE inside visit(Provide) flattens each
+    // Provide after this pass's substitutions, and any recursive
+    // do_inlining(Expr) from get_qualified_body sees active_limit set and
+    // CSEs the qualified body it produces.
+    Stmt result = s;
+    size_t limit = batch_size;
+    while (true) {
+        active_limit = limit;
+        min_skipped_order_id = SIZE_MAX;
+        result = mutate(result);
+        active_limit = SIZE_MAX;
+        if (min_skipped_order_id == SIZE_MAX) {
+            break;
         }
+        limit = min_skipped_order_id + batch_size;
     }
+    return result;
+}
 
-    Stmt visit(const Provide *op) override {
-        ScopedValue<int> old_found(found, 0);
-        Stmt stmt = IRMutator::visit(op);
+Expr Inliner::visit(const Call *op) {
+    // Match by name. Some Call nodes (e.g. those rebuilt by other passes)
+    // may not carry an op->func pointer.
+    auto it = to_inline.find({op->name, op->value_index});
+    if (it != to_inline.end()) {
+        // If this entry's order_id is past the current limit, leave the
+        // Call alone; a later pass with a higher limit will pick it up.
+        // Remember the smallest such order_id so the outer loop can jump
+        // the limit directly to it instead of stepping by batch_size.
+        if (it->second.order_id >= active_limit) {
+            min_skipped_order_id = std::min(min_skipped_order_id, it->second.order_id);
+            return IRMutator::visit(op);
+        }
+        // Below the limit: actually substitute.
+        Entry &entry = it->second;
+        const Function &func = entry.func;
+        // Mutate the args
+        auto args = mutate(op->args);
+
+        // Compute (or re-compute) the cached qualified body. The cache is
+        // (re-)processed whenever the current active_limit has expanded
+        // past the lowest pending order_id inside the cached body, so the
+        // newly-active functions get pulled in here instead of leaving
+        // the outer mutate walks to pick them up at every call site.
+        if (!entry.qualified_body.defined()) {
+            entry.qualified_body = qualify(func.name() + ".", func.values()[op->value_index]);
+            // First-time computation: fall through to the do_inlining
+            // call below.
+            entry.lowest_pending_order_id = 0;  // force re-inline below
+        }
+        if (active_limit > entry.lowest_pending_order_id) {
+            // Save/restore the outer pass's tracking around the recursive
+            // do_inlining so we can record per-entry lowest pending while
+            // still bubbling the outer's seen-skip up.
+            size_t saved_min = min_skipped_order_id;
+            min_skipped_order_id = SIZE_MAX;
+            entry.qualified_body = do_inlining(entry.qualified_body);
+            entry.lowest_pending_order_id = min_skipped_order_id;
+            min_skipped_order_id = std::min(saved_min, entry.lowest_pending_order_id);
+        }
+        Expr body = entry.qualified_body;
+
+        const vector<string> func_args = func.args();
+
+        // Bind the args using Let nodes
+        internal_assert(args.size() == func_args.size());
 
-        // TODO: making this > 1 should be desirable,
-        // but explodes compiletimes in some situations.
-        if (found > 0) {
-            stmt = common_subexpression_elimination(stmt);
+        for (size_t i = 0; i < args.size(); i++) {
+            body = Let::make(func.name() + "." + func_args[i], args[i], body);
         }
 
-        return stmt;
+        return body;
     }
+    return IRMutator::visit(op);
+}
 
-public:
-    int found = 0;
+Expr Inliner::visit(const Variable *op) {
+    // Whole-image inline for wrappers: if op is "<f>.buffer" for some inlined
+    // wrapper f, rewrite it to the wrapped buffer's Variable. Extract the
+    // "<f>" prefix and look up directly rather than scanning to_inline.
+    const string suffix = ".buffer";
+    if (op->name.size() <= suffix.size() ||
+        op->name.compare(op->name.size() - suffix.size(), suffix.size(), suffix) != 0) {
+        return op;
+    }
+    // Wrappers always have a single output, so look up by (name, 0).
+    auto it = to_inline.find({op->name.substr(0, op->name.size() - suffix.size()), 0});
+    if (it == to_inline.end()) {
+        return op;
+    }
+    const Function &func = it->second.func;
+    const Call *call = func.is_wrapper();
+    internal_assert(call);
+    if (call->call_type == Call::Halide) {
+        string buf_name = call->name;
+        if (Function(call->func).outputs() > 1) {
+            buf_name += "." + std::to_string(call->value_index);
+        }
+        buf_name += ".buffer";
+        return Variable::make(type_of<halide_buffer_t *>(), buf_name);
+    } else if (call->param.defined()) {
+        return Variable::make(type_of<halide_buffer_t *>(), call->name + ".buffer", call->param);
+    } else {
+        internal_assert(call->image.defined());
+        return Variable::make(type_of<halide_buffer_t *>(), call->name + ".buffer", call->image);
+    }
+}
 
-    Inliner(const Function &f)
-        : func(f) {
-        internal_assert(f.can_be_inlined()) << "Illegal to inline " << f.name() << "\n";
-        validate_schedule_inlined_function(f);
+Stmt Inliner::visit(const Provide *op) {
+    Stmt stmt = IRMutator::visit(op);
+    // CSE on the Provide if it changed -- IRMutator returns op unchanged
+    // if no child Expr was rewritten, so this skips CSE on Provides where
+    // no inlining (or wrapper .buffer rewrite) touched anything inside.
+    // Running CSE on the whole Stmt rather than each value/index separately
+    // catches shared subexpressions between them.
+    if (!stmt.same_as(op)) {
+        stmt = common_subexpression_elimination(stmt);
     }
-};
+    return stmt;
+}
 
 Stmt inline_function(const Stmt &s, const Function &f) {
     return Inliner(f)(s);
 }
 
 Expr inline_function(Expr e, const Function &f) {
-    Inliner i(f);
-    e = i(e);
-    // TODO: making this > 1 should be desirable,
-    // but explodes compiletimes in some situations.
-    if (i.found > 0) {
-        e = common_subexpression_elimination(e);
+    return Inliner(f).do_inlining(e);
+}
+
+Stmt inline_functions(const Stmt &s, const vector<Function> &fs) {
+    if (fs.empty()) {
+        return s;
+    }
+    Inliner i;
+    for (const Function &f : fs) {
+        internal_assert(f.can_be_inlined()) << "Illegal to inline " << f.name() << "\n";
+        validate_schedule_inlined_function(f);
+        i.add(f);
     }
-    return e;
+    return i.do_inlining(s);
 }
 
 // Inline all calls to 'f' inside 'caller'
diff --git a/src/Inline.h b/src/Inline.h
index 344e7c7ddf6d..5ccee914674d 100644
--- a/src/Inline.h
+++ b/src/Inline.h
@@ -5,12 +5,90 @@
  * Methods for replacing calls to functions with their definitions.
  */
 
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
 #include "Expr.h"
+#include "Function.h"
+#include "IRMutator.h"
 
 namespace Halide {
 namespace Internal {
 
-class Function;
+/** A mutator that inlines a set of pure functions wherever they're called
+ * in an IR. Usage:
+ *
+ *   Inliner inliner;
+ *   for (Function f : to_inline) inliner.add(f);
+ *   Stmt result = inliner.do_inlining(stmt);
+ *   // or: Expr result = inliner.do_inlining(expr);
+ *
+ * For a single function, Inliner(f) gives an equivalent shortcut and
+ * `inline_function(s/e, f)` packages it up further.
+ *
+ * For best performance, add() the functions in consumer-first
+ * (reverse-topological) order: outermost consumers first, innermost
+ * producers last. Any other order is also correct, just slower. See the
+ * implementation comments in Inline.cpp for why. */
+class Inliner : public IRMutator {
+public:
+    Inliner() = default;
+
+    /** Construct an Inliner that will inline a single function. */
+    explicit Inliner(const Function &f);
+
+    /** Insert f into the set of functions to be inlined. */
+    void add(const Function &f);
+
+    /** Inline all calls to the added functions within e/s, returning the
+     * result with CSE applied. */
+    Expr do_inlining(const Expr &e);
+    Stmt do_inlining(const Stmt &s);
+
+    /** operator() on an Inliner is a synonym for do_inlining, so callers
+     * who use the IRMutator-style i(s) syntax get the same behavior as
+     * the explicit i.do_inlining(s). Shadows the inherited inline
+     * operator() from IRMutator. */
+    Expr operator()(const Expr &e) {
+        return do_inlining(e);
+    }
+    Stmt operator()(const Stmt &s) {
+        return do_inlining(s);
+    }
+
+protected:
+    Expr visit(const Call *op) override;
+    Expr visit(const Variable *op) override;
+    Stmt visit(const Provide *op) override;
+
+    using IRMutator::visit;
+
+private:
+    /** Per-(function, value_index) inlining state. */
+    struct Entry {
+        Function func;
+        Expr qualified_body;
+        size_t order_id;
+        /** Min order_id of any inlinable Call still present inside this
+         * entry's qualified_body. SIZE_MAX means the body has no pending
+         * inlines (i.e. it's fully inlined, or hasn't been computed yet).
+         * When active_limit later exceeds this, we recompute the cached
+         * body so that the freshly-active functions are pulled in too. */
+        size_t lowest_pending_order_id = SIZE_MAX;
+    };
+    using Key = std::pair<std::string, int>;
+    std::map<Key, Entry> to_inline;
+
+    size_t active_limit = SIZE_MAX;
+    /** Min order_id of any inlinable Call still un-inlined anywhere in
+     * the working Expr/Stmt this pass. Updated by visit(Call) for Calls
+     * above the limit and bubbled up from re-processing cached bodies.
+     * SIZE_MAX means nothing's left to inline; the deepening loop stops. */
+    size_t min_skipped_order_id = SIZE_MAX;
+    static constexpr size_t batch_size = 8;
+};
 
 /** Inline a single named function, which must be pure. For a pure function to
  * be inlined, it must not have any specializations (i.e. it can only have one
@@ -21,6 +99,12 @@ Expr inline_function(Expr e, const Function &f);
 void inline_function(Function caller, const Function &f);
 // @}
 
+/** Inline a set of pure functions. Equivalent in effect to calling
+ * inline_function(s, f) for each f, but the shared Inliner lets a chain
+ * of nested inlines share work via the qualified-body cache; see Inliner
+ * above for how the batch is processed. */
+Stmt inline_functions(const Stmt &s, const std::vector<Function> &fs);
+
 /** Check if the schedule of an inlined function is legal, throwing an error
  * if it is not. */
 void validate_schedule_inlined_function(Function f);
diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp
index a0b4a4c18876..696fc8afdb59 100644
--- a/src/ScheduleFunctions.cpp
+++ b/src/ScheduleFunctions.cpp
@@ -2576,13 +2576,53 @@ Stmt schedule_functions(const vector<Function> &outputs,
 
     validate_fused_groups_schedule(fused_groups, env);
 
-    for (const auto &group : reverse_view(fused_groups)) {
-        vector<Function> funcs;
-        vector<bool> is_output_list;
+    // Collect consecutive inlinable groups and apply them in one
+    // inline_functions pass. The Inliner caps how many functions are
+    // substituted per CSE invocation internally (see its class doc), so we
+    // just hand it everything pending.
+    //
+    // We flush the batch before each realization so the realization's
+    // validate_schedule sees the post-inline 's' (its callers, if they
+    // were inlined, will have been substituted in by then).
+    vector<Function> pending_inlines;
+    auto flush_pending_inlines = [&]() {
+        if (pending_inlines.empty()) {
+            return;
+        }
+        debug(1) << "Inlining group of " << pending_inlines.size()
+                 << " function(s): " << pending_inlines << "\n";
+        s = inline_functions(s, pending_inlines);
+        pending_inlines.clear();
+        debug(2) << "Lowering after inlining group of functions:\n"
+                 << s << "\n";
+    };
 
+    for (const auto &group : reverse_view(fused_groups)) {
+        vector<Function> group_funcs;
+        group_funcs.reserve(group.size());
         for (const string &name : group) {
-            Function f = env.find(name)->second;
+            group_funcs.push_back(env.find(name)->second);
+        }
+
+        if (group_should_be_inlined(group_funcs)) {
+            // Inlinable groups have a single pure func. Schedule legality is
+            // a property of the func alone; we don't need to consult 's'.
+            const Function &f = group_funcs[0];
+            validate_schedule_inlined_function(f);
+            pending_inlines.push_back(f);
+            continue;
+        }
 
+        // Realization: flush any pending inlines first so that
+        // validate_schedule and the InjectFunctionRealization walk see the
+        // post-inline 's'. In particular, ComputeLegalSchedules inside
+        // validate_schedule needs the inlined call sites to be visible to
+        // find this group's funcs.
+        flush_pending_inlines();
+
+        vector<Function> funcs;
+        vector<bool> is_output_list;
+        for (const Function &f : group_funcs) {
             bool is_output = false;
             for (const Function &o : outputs) {
                 is_output = is_output | o.same_as(f);
@@ -2605,18 +2645,13 @@ Stmt schedule_functions(const vector<Function> &outputs,
             continue;
         }
 
-        if (group_should_be_inlined(funcs)) {
-            debug(1) << "Inlining " << funcs[0].name() << "\n";
-            s = inline_function(s, funcs[0]);
-        } else {
-            debug(1) << "Injecting realization of " << funcs << "\n";
-            InjectFunctionRealization injector(funcs, is_output_list, target, env);
-            s = injector(s);
-            internal_assert(injector.found_store_level() && injector.found_compute_level() && injector.found_hoist_storage_level());
-        }
-
+        debug(1) << "Injecting realization of " << funcs << "\n";
+        InjectFunctionRealization injector(funcs, is_output_list, target, env);
+        s = injector(s);
+        internal_assert(injector.found_store_level() && injector.found_compute_level() && injector.found_hoist_storage_level());
         debug(2) << s << "\n";
     }
+    flush_pending_inlines();
 
     // We can remove the loop over root now
     const For *root_loop = s.as<For>();
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
index ca43f5f2cf40..297684a000f3 100644
--- a/test/correctness/CMakeLists.txt
+++ b/test/correctness/CMakeLists.txt
@@ -81,6 +81,7 @@ tests(GROUPS correctness
       debug_to_file_multiple_outputs.cpp
       debug_to_file_reorder.cpp
       decompose_vector_shuffle.cpp
+      deep_inline_chain.cpp
       deferred_loop_level.cpp
       deinterleave4.cpp
       device_buffer_copies_with_profile.cpp
diff --git a/test/correctness/deep_inline_chain.cpp b/test/correctness/deep_inline_chain.cpp
new file mode 100644
index 000000000000..03464826250d
--- /dev/null
+++ b/test/correctness/deep_inline_chain.cpp
@@ -0,0 +1,38 @@
+#include "Halide.h"
+
+using namespace Halide;
+
+// Stress-test for inlining a Fibonacci-shaped chain of pure Funcs:
+//   f_k(x) = f_{k-2}(f_{k-1}(x))
+// Each f_k references its two predecessors, so naive per-call expansion of the
+// call graph into the final IR is Fibonacci-sized — exponential in the chain
+// length. The lowering pipeline must keep this within reason; this test exists
+// to surface regressions that turn that into a crash or runaway compile.
+int main(int argc, char **argv) {
+    Var x, c;
+    std::vector<Func> funcs;
+    funcs.emplace_back(lambda(x, c, x + c));
+    funcs.emplace_back(lambda(x, c, x + c + 1));
+    funcs.emplace_back(lambda(x, c, x + c + 2));
+    funcs.emplace_back(lambda(x, c, x + c + 3));
+    for (int i = 0; i < 200; i++) {
+        size_t j = funcs.size();
+        Func next;
+        Func lut;
+        lut(x) = x * x + i;
+        Expr e = 0;
+        for (int k = 0; k < 10; k++) {
+            int l = j - 1 - k;
+            if (l < 0) break;
+            Func &f = funcs[l];
+            e += f(x, 0) * f(x, 1);
+        }
+        next(x, c) = lut(e);
+        funcs.push_back(std::move(next));
+    }
+
+    funcs.back().compile_jit(Target{"host"});
+
+    printf("Success!\n");
+    return 0;
+}

From 7a76a6fe44baee7540bb2088f2ad101c425812cd Mon Sep 17 00:00:00 2001
From: Andrew Adams <andrew.b.adams@gmail.com>
Date: Tue, 19 May 2026 16:13:09 -0700
Subject: [PATCH 2/6] deep_inline_chain: clarify intent and check correctness
 of output

The test was previously a smoke test -- it just compile_jit'd a deeply
inlined pipeline and printed Success! if nothing crashed. The comment
described it as Fibonacci-shaped which was no longer accurate.

Update the comment to describe what the test actually exercises (a chain
of compute_inline Funcs where each one references its last 10
predecessors through a per-level LUT), what failure modes it guards
against (hangs, crashes, wrong values), and rewrite the body to compute
the expected output value in plain C++ alongside the IR build so a
silently-incorrect inliner result will fail the test with "Mismatch"
rather than print "Success!".

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 test/correctness/deep_inline_chain.cpp | 82 ++++++++++++++++++++------
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/test/correctness/deep_inline_chain.cpp b/test/correctness/deep_inline_chain.cpp
index 03464826250d..7ac21f9c26fe 100644
--- a/test/correctness/deep_inline_chain.cpp
+++ b/test/correctness/deep_inline_chain.cpp
@@ -2,36 +2,80 @@
 
 using namespace Halide;
 
-// Stress-test for inlining a Fibonacci-shaped chain of pure Funcs:
-//   f_k(x) = f_{k-2}(f_{k-1}(x))
-// Each f_k references its two predecessors, so naive per-call expansion of the
-// call graph into the final IR is Fibonacci-sized — exponential in the chain
-// length. The lowering pipeline must keep this within reason; this test exists
-// to surface regressions that turn that into a crash or runaway compile.
+// Stress-test for the inliner on a long chain of inline-scheduled Funcs.
+//
+// The test builds a sequence of Funcs where each new Func calls its previous
+// 10 predecessors (at two different values of an extra coordinate) and
+// passes the sum through a fresh per-level lookup-table Func. All Funcs are
+// left at their default schedule (compute_inline), so every one of them
+// must be inlined into the output during lowering.
+//
+// Before src/Inline.cpp learned to batch and CSE between batches, this was
+// exponentially expensive in the chain length: ScheduleFunctions inlined
+// one Func at a time, paying O(N) walks of 's', and BoundsInference inlined
+// every Func at once into a giant Let-nested DAG that CSE's RemoveLets then
+// re-walked exponentially under each Let. With the batched/iterative-
+// deepening Inliner, lowering this pipeline takes well under a second.
+//
+// Failure modes this test guards against:
+//   - The test hangs or times out in CI: a regression has reintroduced the
+//     exponential/quadratic lowering cost.
+//   - The test crashes during JIT compilation or execution: the inliner
+//     produced malformed IR.
+//   - The test prints "Mismatch": the inliner produced incorrect IR that
+//     still lowers and runs, but computes the wrong value.
+//
+// The mismatch case is checked by computing one output value out-of-band
+// (by walking the Func vector at C++ level) and comparing.
 int main(int argc, char **argv) {
     Var x, c;
+
+    // Reference computation, run in C++ alongside the pipeline build to
+    // give us an expected output value to compare against.
+    std::vector<std::vector<int>> ref;  // ref[i][c] == funcs[i](x=0, c)
+
     std::vector<Func> funcs;
-    funcs.emplace_back(lambda(x, c, x + c));
-    funcs.emplace_back(lambda(x, c, x + c + 1));
-    funcs.emplace_back(lambda(x, c, x + c + 2));
-    funcs.emplace_back(lambda(x, c, x + c + 3));
-    for (int i = 0; i < 200; i++) {
-        size_t j = funcs.size();
-        Func next;
-        Func lut;
+    auto add_leaf = [&](Func f, int v0, int v1) {
+        funcs.push_back(std::move(f));
+        ref.push_back({v0, v1});
+    };
+    add_leaf(lambda(x, c, x + c), 0, 1);
+    add_leaf(lambda(x, c, x + c + 1), 1, 2);
+    add_leaf(lambda(x, c, x + c + 2), 2, 3);
+    add_leaf(lambda(x, c, x + c + 3), 3, 4);
+
+    // Number of layers added on top of the four leaf Funcs. Each layer is
+    // one Func, with a unique LUT also inlined into it. 100 layers is
+    // plenty to exercise the batched Inliner (the batch size is 8, so this
+    // produces many sequential batches).
+    const int N = 100;
+
+    for (int i = 0; i < N; i++) {
+        Func next, lut;
         lut(x) = x * x + i;
         Expr e = 0;
-        for (int k = 0; k < 10; k++) {
-            int l = j - 1 - k;
-            if (l < 0) break;
-            Func &f = funcs[l];
+        int e_ref = 0;
+        for (int k = 0; k < 10 && k < (int)funcs.size(); k++) {
+            Func &f = funcs[funcs.size() - 1 - k];
             e += f(x, 0) * f(x, 1);
+            const auto &fr = ref[ref.size() - 1 - k];
+            e_ref += fr[0] * fr[1];
         }
         next(x, c) = lut(e);
         funcs.push_back(std::move(next));
+        // lut(e_ref) = e_ref * e_ref + i. next doesn't depend on c, so
+        // both c=0 and c=1 give the same value.
+        int v = e_ref * e_ref + i;
+        ref.push_back({v, v});
     }
 
-    funcs.back().compile_jit(Target{"host"});
+    Buffer<int> out = funcs.back().realize({1, 2});
+    int expected = ref.back()[0];
+    if (out(0, 0) != expected || out(0, 1) != expected) {
+        printf("Mismatch: got (%d, %d), expected (%d, %d)\n",
+               out(0, 0), out(0, 1), expected, expected);
+        return 1;
+    }
 
     printf("Success!\n");
     return 0;

From 3d39ed010b20ccf0a478e3ad4a9fd3077d9b3fb5 Mon Sep 17 00:00:00 2001
From: Andrew Adams <andrew.b.adams@gmail.com>
Date: Tue, 19 May 2026 16:27:39 -0700
Subject: [PATCH 3/6] Clean up Inline.{h,cpp}: drop unused includes and stale
 comments

Remove unused includes (<chrono>, <iostream>, <set>) left over from
earlier instrumentation in Inline.cpp, and an unused <set> from Inline.h.
Also trim two comments in visit(Call) that no longer add information.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 src/Inline.cpp | 11 ++---------
 src/Inline.h   |  1 -
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/Inline.cpp b/src/Inline.cpp
index 73c3e0d9d383..1ce50d149007 100644
--- a/src/Inline.cpp
+++ b/src/Inline.cpp
@@ -1,7 +1,3 @@
-#include <chrono>
-#include <iostream>
-#include <set>
-
 #include "CSE.h"
 #include "Debug.h"
 #include "ExternFuncArgument.h"
@@ -221,8 +217,6 @@ Stmt Inliner::do_inlining(const Stmt &s) {
 }
 
 Expr Inliner::visit(const Call *op) {
-    // Match by name. Some Call nodes (e.g. those rebuilt by other passes)
-    // may not carry an op->func pointer.
     auto it = to_inline.find({op->name, op->value_index});
     if (it != to_inline.end()) {
         // If this entry's order_id is past the current limit, leave the
@@ -246,9 +240,8 @@ Expr Inliner::visit(const Call *op) {
         // the outer mutate walks to pick them up at every call site.
         if (!entry.qualified_body.defined()) {
             entry.qualified_body = qualify(func.name() + ".", func.values()[op->value_index]);
-            // First-time computation: fall through to the do_inlining
-            // call below.
-            entry.lowest_pending_order_id = 0;  // force re-inline below
+            // Fall through to the do_inlining call below.
+            entry.lowest_pending_order_id = 0;
         }
         if (active_limit > entry.lowest_pending_order_id) {
             // Save/restore the outer pass's tracking around the recursive
diff --git a/src/Inline.h b/src/Inline.h
index 5ccee914674d..e14156d8e45b 100644
--- a/src/Inline.h
+++ b/src/Inline.h
@@ -6,7 +6,6 @@
  */
 
 #include <map>
-#include <set>
 #include <utility>
 #include <vector>
 

From 9ee7176fd8a40dee0494eb140ecd13350d02a538 Mon Sep 17 00:00:00 2001
From: Andrew Adams <andrew.b.adams@gmail.com>
Date: Tue, 19 May 2026 18:39:44 -0700
Subject: [PATCH 4/6] Drop Inliner::do_inlining; use operator() only

do_inlining and operator() were synonyms. Collapse to operator(), which
is the natural entry point for an IRMutator-derived class anyway.

Also drop the ScheduleFunctions comment that leaked Inliner internals
(per-CSE batching is an Inliner implementation detail).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 src/BoundsInference.cpp   |  4 ++--
 src/Inline.cpp            | 22 +++++++++++-----------
 src/Inline.h              | 22 ++++++----------------
 src/ScheduleFunctions.cpp | 10 +++-------
 4 files changed, 22 insertions(+), 36 deletions(-)

diff --git a/src/BoundsInference.cpp b/src/BoundsInference.cpp
index 76e12b0cb303..c77ebfeecda7 100644
--- a/src/BoundsInference.cpp
+++ b/src/BoundsInference.cpp
@@ -640,7 +640,7 @@ class BoundsInference : public IRMutator {
             vector<pair<Expr, int>> buffers_to_annotate;
             for (const auto &arg : args) {
                 if (arg.is_expr()) {
-                    bounds_inference_args.push_back(inliner->do_inlining(arg.expr));
+                    bounds_inference_args.push_back((*inliner)(arg.expr));
                 } else if (arg.is_func()) {
                     Function input(arg.func);
                     for (int k = 0; k < input.outputs(); k++) {
@@ -854,7 +854,7 @@ class BoundsInference : public IRMutator {
         for (auto &s : stages) {
             for (auto &cond_val : s.exprs) {
                 internal_assert(cond_val.value.defined());
-                cond_val.value = inliner.do_inlining(cond_val.value);
+                cond_val.value = inliner(cond_val.value);
             }
         }
 
diff --git a/src/Inline.cpp b/src/Inline.cpp
index 1ce50d149007..099aa58b1c45 100644
--- a/src/Inline.cpp
+++ b/src/Inline.cpp
@@ -117,7 +117,7 @@ void validate_schedule_inlined_function(Function f) {
 // the range we measured.
 //
 // Implementation: each add() call assigns the entry an order_id (its
-// position in the add() sequence). do_inlining processes the set by
+// position in the add() sequence). operator() processes the set by
 // iterative deepening through that sequence -- a series of passes that
 // raise an active_limit by batch_size each time, with visit(Call) only
 // inlining entries whose order_id is below the current limit. The CSE
@@ -148,14 +148,14 @@ void Inliner::add(const Function &f) {
         if (inserted) {
             it->second.func = f;
             // order_id is the entry's position in the add() sequence, used
-            // by do_inlining's iterative-deepening loop to decide when this
+            // by operator()'s iterative-deepening loop to decide when this
             // entry first becomes eligible to inline.
             it->second.order_id = to_inline.size() - 1;
         }
     }
 }
 
-Expr Inliner::do_inlining(const Expr &e) {
+Expr Inliner::operator()(const Expr &e) {
     // Single-pass path: either we're already inside an outer deepening
     // pass (recursive call from get_qualified_body), or the set is small
     // enough that the deepening loop wouldn't do anything useful.
@@ -191,7 +191,7 @@ Expr Inliner::do_inlining(const Expr &e) {
     return result;
 }
 
-Stmt Inliner::do_inlining(const Stmt &s) {
+Stmt Inliner::operator()(const Stmt &s) {
     if (active_limit != SIZE_MAX || to_inline.size() <= batch_size) {
         return mutate(s);
     }
@@ -199,7 +199,7 @@ Stmt Inliner::do_inlining(const Stmt &s) {
     // because the CSE that bounds per-pass work happens in two places
     // already: the per-Provide CSE inside visit(Provide) flattens each
     // Provide after this pass's substitutions, and any recursive
-    // do_inlining(Expr) from get_qualified_body sees active_limit set and
+    // operator()(Expr) from get_qualified_body sees active_limit set and
     // CSEs the qualified body it produces.
     Stmt result = s;
     size_t limit = batch_size;
@@ -240,16 +240,16 @@ Expr Inliner::visit(const Call *op) {
         // the outer mutate walks to pick them up at every call site.
         if (!entry.qualified_body.defined()) {
             entry.qualified_body = qualify(func.name() + ".", func.values()[op->value_index]);
-            // Fall through to the do_inlining call below.
+            // Fall through to the recursive operator() call below.
             entry.lowest_pending_order_id = 0;
         }
         if (active_limit > entry.lowest_pending_order_id) {
             // Save/restore the outer pass's tracking around the recursive
-            // do_inlining so we can record per-entry lowest pending while
-            // still bubbling the outer's seen-skip up.
+            // call so we can record per-entry lowest pending while still
+            // bubbling the outer's seen-skip up.
             size_t saved_min = min_skipped_order_id;
             min_skipped_order_id = SIZE_MAX;
-            entry.qualified_body = do_inlining(entry.qualified_body);
+            entry.qualified_body = (*this)(entry.qualified_body);
             entry.lowest_pending_order_id = min_skipped_order_id;
             min_skipped_order_id = std::min(saved_min, entry.lowest_pending_order_id);
         }
@@ -319,7 +319,7 @@ Stmt inline_function(const Stmt &s, const Function &f) {
 }
 
 Expr inline_function(Expr e, const Function &f) {
-    return Inliner(f).do_inlining(e);
+    return Inliner(f)(e);
 }
 
 Stmt inline_functions(const Stmt &s, const vector<Function> &fs) {
@@ -332,7 +332,7 @@ Stmt inline_functions(const Stmt &s, const vector<Function> &fs) {
         validate_schedule_inlined_function(f);
         i.add(f);
     }
-    return i.do_inlining(s);
+    return i(s);
 }
 
 // Inline all calls to 'f' inside 'caller'
diff --git a/src/Inline.h b/src/Inline.h
index e14156d8e45b..c84405167e09 100644
--- a/src/Inline.h
+++ b/src/Inline.h
@@ -21,8 +21,8 @@ namespace Internal {
  *
  *   Inliner inliner;
  *   for (Function f : to_inline) inliner.add(f);
- *   Stmt result = inliner.do_inlining(stmt);
- *   // or: Expr result = inliner.do_inlining(expr);
+ *   Stmt result = inliner(stmt);
+ *   // or: Expr result = inliner(expr);
  *
  * For a single function, Inliner(f) gives an equivalent shortcut and
  * `inline_function(s/e, f)` packages it up further.
@@ -42,20 +42,10 @@ class Inliner : public IRMutator {
     void add(const Function &f);
 
     /** Inline all calls to the added functions within e/s, returning the
-     * result with CSE applied. */
-    Expr do_inlining(const Expr &e);
-    Stmt do_inlining(const Stmt &s);
-
-    /** operator() on an Inliner is a synonym for do_inlining, so callers
-     * who use the IRMutator-style i(s) syntax get the same behavior as
-     * the explicit i.do_inlining(s). Shadows the inherited inline
-     * operator() from IRMutator. */
-    Expr operator()(const Expr &e) {
-        return do_inlining(e);
-    }
-    Stmt operator()(const Stmt &s) {
-        return do_inlining(s);
-    }
+     * result with CSE applied. Shadows the inherited inline operator()
+     * from IRMutator. */
+    Expr operator()(const Expr &e);
+    Stmt operator()(const Stmt &s);
 
 protected:
     Expr visit(const Call *op) override;
diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp
index 696fc8afdb59..6e3c319079f7 100644
--- a/src/ScheduleFunctions.cpp
+++ b/src/ScheduleFunctions.cpp
@@ -2577,13 +2577,9 @@ Stmt schedule_functions(const vector<Function> &outputs,
     validate_fused_groups_schedule(fused_groups, env);
 
     // Collect consecutive inlinable groups and apply them in one
-    // inline_functions pass. The Inliner caps how many functions are
-    // substituted per CSE invocation internally (see its class doc), so we
-    // just hand it everything pending.
-    //
-    // We flush the batch before each realization so the realization's
-    // validate_schedule sees the post-inline 's' (its callers, if they
-    // were inlined, will have been substituted in by then).
+    // inline_functions pass. We flush the batch before each realization so
+    // the realization's validate_schedule sees the post-inline 's' (its
+    // callers, if they were inlined, will have been substituted in by then).
     vector<Function> pending_inlines;
     auto flush_pending_inlines = [&]() {
         if (pending_inlines.empty()) {

From d8ca3975f82a2a76ea2dac96dba87583f8001c91 Mon Sep 17 00:00:00 2001
From: "halide-ci[bot]" <266445882+halide-ci[bot]@users.noreply.github.com>
Date: Wed, 20 May 2026 01:55:55 +0000
Subject: [PATCH 5/6] Apply pre-commit auto-fixes

---
 src/Inline.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Inline.cpp b/src/Inline.cpp
index 099aa58b1c45..a2d3407114d8 100644
--- a/src/Inline.cpp
+++ b/src/Inline.cpp
@@ -1,10 +1,10 @@
+#include "Inline.h"
 #include "CSE.h"
 #include "Debug.h"
 #include "ExternFuncArgument.h"
 #include "IRMutator.h"
 #include "IROperator.h"
 #include "IRPrinter.h"
-#include "Inline.h"
 #include "Qualify.h"
 #include "Substitute.h"
 

From 6b03619b53ed158f37f25d877b7558e66882d7d8 Mon Sep 17 00:00:00 2001
From: Andrew Adams <andrew.b.adams@gmail.com>
Date: Wed, 20 May 2026 12:57:35 -0700
Subject: [PATCH 6/6] Fix two CI failures from the Inliner refactor

1. error_hoist_storage_without_compute_at: the new ScheduleFunctions
   short-circuit for inlinable groups skipped validate_schedule, which is
   where the hoist_storage-without-compute_at error lives. Calling the
   full validate_schedule on inlinable funcs is also wrong though: it
   walks 's' for call sites via ComputeLegalSchedules, which fails for
   batched inline chains whose inner call sites haven't been exposed in
   's' yet (e.g. repeat_edge called from a not-yet-flushed downsampled).
   So we run only the schedule-property checks directly here.

2. truncated_pyramid/fft: the qualified-body cache only propagated its
   lowest_pending_order_id up to the deepening loop's min_skipped when
   re-processing the cache. On a cache hit without re-processing, the
   cached body still has un-inlined Calls at order_id == lowest_pending
   but the outer loop wasn't told, so it could terminate while those
   Calls remained. Propagate lowest_pending up unconditionally.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 src/Inline.cpp            | 20 +++++++-------------
 src/ScheduleFunctions.cpp | 19 +++++++++++++++++--
 2 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/src/Inline.cpp b/src/Inline.cpp
index a2d3407114d8..79bb80610e90 100644
--- a/src/Inline.cpp
+++ b/src/Inline.cpp
@@ -156,9 +156,6 @@ void Inliner::add(const Function &f) {
 }
 
 Expr Inliner::operator()(const Expr &e) {
-    // Single-pass path: either we're already inside an outer deepening
-    // pass (recursive call from get_qualified_body), or the set is small
-    // enough that the deepening loop wouldn't do anything useful.
     if (active_limit != SIZE_MAX || to_inline.size() <= batch_size) {
         return common_subexpression_elimination(mutate(e));
     }
@@ -233,26 +230,23 @@ Expr Inliner::visit(const Call *op) {
         // Mutate the args
         auto args = mutate(op->args);
 
-        // Compute (or re-compute) the cached qualified body. The cache is
-        // (re-)processed whenever the current active_limit has expanded
-        // past the lowest pending order_id inside the cached body, so the
-        // newly-active functions get pulled in here instead of leaving
-        // the outer mutate walks to pick them up at every call site.
+        // Compute (or re-process) the cached qualified body when the
+        // current active_limit lets us pull more inlinable Calls into it.
         if (!entry.qualified_body.defined()) {
             entry.qualified_body = qualify(func.name() + ".", func.values()[op->value_index]);
-            // Fall through to the recursive operator() call below.
             entry.lowest_pending_order_id = 0;
         }
         if (active_limit > entry.lowest_pending_order_id) {
-            // Save/restore the outer pass's tracking around the recursive
-            // call so we can record per-entry lowest pending while still
-            // bubbling the outer's seen-skip up.
             size_t saved_min = min_skipped_order_id;
             min_skipped_order_id = SIZE_MAX;
             entry.qualified_body = (*this)(entry.qualified_body);
             entry.lowest_pending_order_id = min_skipped_order_id;
-            min_skipped_order_id = std::min(saved_min, entry.lowest_pending_order_id);
+            min_skipped_order_id = std::min(saved_min, min_skipped_order_id);
         }
+        // Whether or not we re-processed, the cached body still has Calls
+        // at order_id == lowest_pending un-inlined. Propagate that up so
+        // the outer deepening loop knows there's more work.
+        min_skipped_order_id = std::min(min_skipped_order_id, entry.lowest_pending_order_id);
         Expr body = entry.qualified_body;
 
         const vector<string> func_args = func.args();
diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp
index 6e3c319079f7..62ec36399a45 100644
--- a/src/ScheduleFunctions.cpp
+++ b/src/ScheduleFunctions.cpp
@@ -2601,9 +2601,24 @@ Stmt schedule_functions(const vector<Function> &outputs,
         }
 
         if (group_should_be_inlined(group_funcs)) {
-            // Inlinable groups have a single pure func. Schedule legality is
-            // a property of the func alone; we don't need to consult 's'.
+            // Inlinable groups have a single pure func. Check the
+            // schedule-property errors directly here; we can't call
+            // validate_schedule (which walks 's' for call sites) because
+            // batched inline chains may not yet have their inner call
+            // sites exposed in 's'.
             const Function &f = group_funcs[0];
+            const LoopLevel &store_at = f.schedule().store_level();
+            const LoopLevel &hoist_storage_at = f.schedule().hoist_storage_level();
+            if (store_at.is_root()) {
+                user_error << "Func \"" << f.name() << "\" is scheduled store_root(), but is inlined. Funcs that use store_root must also call compute_root or compute_at.\n";
+            } else if (!store_at.is_inlined()) {
+                user_error << "Func \"" << f.name() << "\" is scheduled store_at(), but is inlined. Funcs that use store_at must also call compute_at.\n";
+            }
+            if (hoist_storage_at.is_root()) {
+                user_error << "Func \"" << f.name() << "\" is scheduled hoist_storage_root(), but is inlined. Funcs that use hoist_storage_root must also call compute_root or compute_at.\n";
+            } else if (!hoist_storage_at.is_inlined()) {
+                user_error << "Func \"" << f.name() << "\" is scheduled hoist_storage(), but is inlined. Funcs that use hoist_storage_root must also call compute_at.\n";
+            }
             validate_schedule_inlined_function(f);
             pending_inlines.push_back(f);
             continue;