From 6e5602a3c2d657fdb0df25771ef364d148c768cb Mon Sep 17 00:00:00 2001 From: mscherer Date: Thu, 14 May 2026 02:39:36 +0200 Subject: [PATCH] Add red 'stalled' status to admin dashboard banner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dashboard banner now distinguishes three states instead of two: - green Queue Running (heartbeat < dashboardIdleAfter, 60s default) - yellow Queue Idle (heartbeat >= dashboardIdleAfter, or no heartbeat and no backlog) - red Queue Stalled (heartbeat >= dashboardStalledAfter, 120s default, with a pending backlog AND no in-flight job; OR no worker has reported at all and there is a pending backlog or stuck in-flight job) Red surfaces the case the old "Queue Idle" yellow blurred away: jobs are piling up and nothing is processing them. The previous banner also showed a muted info notice ("No queue status available") when every worker row had aged past Queue.defaultRequeueTimeout — the worst-case full cron outage. That path now produces the red banner too. The conditions are tuned to avoid false positives: - workers == 0 alone does not trigger red. In cron-driven mode that is the normal idle state for a quiet system. - runningJobs > 0 keeps a busy worker out of red. Heartbeats fire at the top of each loop, not during runJob(), so a long-running task (>2 min) with more pending behind it would otherwise look stalled by heartbeat age alone. When red, the banner expands with a diagnostic grid (last activity absolute + relative, workers, pending) and a context-specific cause hint that names the most likely fault and the relevant CLI command. Thresholds are exposed as two new config keys: - Queue.dashboardIdleAfter (default 60, seconds) - Queue.dashboardStalledAfter (default 120, seconds) Defaults are deliberate UI policy — human-perceptible 1-min / 2-min boundaries — not derived from queue mechanics. 
None of the existing config knobs (workerLifetime, defaultRequeueTimeout, sleeptime) actually mean "dashboard heartbeat freshness", so deriving from them coupled the banner to unrelated semantics. Installations with unusual cron cadence (e.g. slow exitwhennothingtodo cron) should raise dashboardStalledAfter past the cron interval to avoid false-red between ticks. --- config/app.example.php | 17 +++ templates/Admin/Queue/index.php | 184 +++++++++++++++++++++++++------- templates/layout/queue.php | 16 +++ 3 files changed, 180 insertions(+), 37 deletions(-) diff --git a/config/app.example.php b/config/app.example.php index ae54705c..c12190f7 100644 --- a/config/app.example.php +++ b/config/app.example.php @@ -127,6 +127,23 @@ // auto-refresh dashboard in seconds (0 = disabled) 'dashboardAutoRefresh' => 0, + // Status-banner thresholds on the admin dashboard, in seconds. The + // banner has three colors: green (running), yellow (idle), red + // (stalled — action required). + // running: fresh heartbeat (< dashboardIdleAfter) + // idle: stale heartbeat (>= dashboardIdleAfter), or no heartbeat and no backlog + // stalled: >= dashboardStalledAfter with a pending backlog and no + // in-flight job, OR no worker reporting with a backlog or stuck in-flight job + // Defaults (60 / 120) are deliberate UI policy — human-perceptible + // 1-min / 2-min boundaries — not derived from queue mechanics, since + // no existing config knob (workerLifetime, defaultRequeueTimeout, + // sleeptime) actually means "heartbeat freshness." Override for + // unusual cadences (e.g. slow cron in `exitwhennothingtodo` mode — + // raise dashboardStalledAfter past the cron interval to avoid + // false-red between ticks). 
+ 'dashboardIdleAfter' => 60, + 'dashboardStalledAfter' => 120, + // Standalone mode for admin controllers: // - false (default): Extends App\Controller\AppController, inherits app auth/components // - true: Isolated admin, skips app's AppController setup diff --git a/templates/Admin/Queue/index.php b/templates/Admin/Queue/index.php index dc9ea613..177c971d 100644 --- a/templates/Admin/Queue/index.php +++ b/templates/Admin/Queue/index.php @@ -25,58 +25,168 @@ use Cake\Core\Configure; +// Banner thresholds are a UI policy, not a system mechanic: how long the +// dashboard waits before nagging the admin. Defaults are 60s yellow / 120s +// red — human-perceptible minute boundaries — and don't derive from queue +// config knobs because none of them actually mean "heartbeat freshness": +// - workerLifetime is an exit policy (a 1h-lifetime worker still +// heartbeats every ~sleeptime when idle). +// - defaultRequeueTimeout is the job-reassignment safeguard, tuned for +// max job duration (often 5-10 min). +// - sleeptime is closest to the real heartbeat cadence for an idle +// worker, but busy workers don't sleep — and we already cover the +// busy-worker case via the `runningJobs > 0` escape hatch below. +// Override these for installations with unusual cron cadence (e.g. slow +// `exitwhennothingtodo` cron — raise dashboardStalledAfter past the cron +// interval to avoid false-red between ticks). +$idleAfterSeconds = (int)Configure::read('Queue.dashboardIdleAfter', 60); +$stalledAfterSeconds = (int)Configure::read('Queue.dashboardStalledAfter', 120); ?> - - + * - `$runningJobs > 0` (derived in the controller from + * `fetched IS NOT NULL AND completed IS NULL`) keeps a busy worker out of + * red: heartbeats fire at the top of each loop, not during long jobs, so + * a >2 min task with more pending behind it would look stalled by heartbeat + * age alone. + * - When `$status` is empty, `QueueProcessesTable::status()` filtered every + * worker row past `Queue.defaultRequeueTimeout`. 
In that case a pending + * backlog or stuck in-flight job is unambiguously a problem. + */ +$state = 'idle'; +$time = null; +$relTime = null; + +if ($status) { /** @var \Cake\I18n\DateTime $time */ $time = $status['time']; - $running = $time->addMinutes(1)->isFuture(); + $now = new \Cake\I18n\DateTime(); + $secondsSinceActivity = max(0, $now->getTimestamp() - $time->getTimestamp()); + + $state = 'running'; + if ($secondsSinceActivity >= $idleAfterSeconds) { + $state = 'idle'; + } + if ($secondsSinceActivity >= $stalledAfterSeconds && $pendingJobs > 0 && $runningJobs === 0) { + $state = 'stalled'; + } + $relTime = method_exists($this->Time, 'relLengthOfTime') ? $this->Time->relLengthOfTime($status['time']) : $this->Time->timeAgoInWords($status['time']); - ?> -
-
-
- - - - - - - -
- -
+} elseif ($pendingJobs > 0 || $runningJobs > 0) { + // No worker has reported within `Queue.defaultRequeueTimeout`, yet jobs are + // either waiting (pending) or marked in-flight (fetched but not completed). + // Pending + no heartbeat = cron likely dead. Running + no heartbeat = worker + // died mid-job and left a stale fetched row, OR a job legitimately ran past + // the requeue timeout (which is itself a misconfiguration worth surfacing). + $state = 'stalled'; +} + +$stateMeta = [ + 'running' => ['icon' => 'check-circle', 'iconColor' => 'text-success', 'label' => __d('queue', 'Queue Running')], + 'idle' => ['icon' => 'pause-circle', 'iconColor' => 'text-warning', 'label' => __d('queue', 'Queue Idle')], + 'stalled' => ['icon' => 'exclamation-circle', 'iconColor' => 'text-danger', 'label' => __d('queue', 'Queue Stalled')], +][$state]; +?> +
+
+
+ + + +
+ + + + +
+ • - Html->link( - __d('queue', '{0} worker(s)', $workers), - ['action' => 'processes'], - ['class' => 'text-decoration-none'] - ) ?> + + • - -
+ + Html->link( + __d('queue', '{0} worker(s)', $workers), + ['action' => 'processes'], + ['class' => 'text-decoration-none'] + ) ?> + • +
-
- Html->link( - '' . __d('queue', 'Manage Workers'), - ['action' => 'processes'], - ['class' => 'btn btn-sm btn-outline-dark', 'escapeTitle' => false] - ) ?> -
+
+
+ Html->link( + '' . __d('queue', 'Manage Workers'), + ['action' => 'processes'], + ['class' => 'btn btn-sm btn-outline-dark', 'escapeTitle' => false] + ) ?>
- -
- - -
- + + 0) { + $causeHint = __d('queue', 'A job is marked in-flight but no worker is reporting. The worker likely crashed mid-job — reset stale fetched jobs and check cron.'); + } elseif (!$status) { + $causeHint = __d('queue', 'No worker has reported in. Cron is likely not firing — check that {0} runs on at least one server.', 'bin/cake queue run'); + } elseif ($workers === 0) { + $causeHint = __d('queue', 'Jobs are waiting but no workers are running. Check that {0} cron is firing on at least one server.', 'bin/cake queue run'); + } else { + $causeHint = __d('queue', "Jobs are waiting but aren't being picked up. Workers may have crashed — restart the queue or clean up stale processes."); + } + ?> +
+
+
+
+ + i18nFormat('yyyy-MM-dd HH:mm:ss')) ?> + · + + + +
+
+
+ + +
+
+
+ + +
+
+
+ + +
+
+ +
diff --git a/templates/layout/queue.php b/templates/layout/queue.php index 84b7126a..ef4f4a80 100644 --- a/templates/layout/queue.php +++ b/templates/layout/queue.php @@ -248,6 +248,22 @@ border: 1px solid #ffc107; } + .status-banner.status-stalled { + background: linear-gradient(135deg, #f8d7da 0%, #f5c6cb 100%); + border: 1px solid #dc3545; + border-left-width: 4px; + } + + .status-banner.status-stalled .stalled-details dt { + padding-top: 0.125rem; + } + + .status-banner.status-stalled .stalled-details code { + background: rgba(220, 53, 69, 0.08); + padding: 0.125rem 0.375rem; + border-radius: 0.25rem; + } + .status-banner .status-icon { font-size: 1.5rem; }