From e269dd2aa1a8168243ac1ceae48ea646f25037bd Mon Sep 17 00:00:00 2001 From: rongxin Date: Fri, 8 May 2026 11:29:36 +0800 Subject: [PATCH] fix: resolve readiness WaitReady blocking for 5 minutes on startup Backport fixes from upstream apache/apisix-ingress-controller#2663. Root cause: readiness.Start() is asynchronous. If a controller's reconcile loop calls Done() before Start() finishes registering resources, Done() finds no state entry and returns early. The resource is never removed from state, causing WaitReady to block until the 5-minute timeout. Changes: - Done() now waits for Start() to complete (<-r.started) before operating on state, eliminating the race condition - WaitReady() returns false on timeout instead of true (semantic fix: timed-out != ready) - Remove unnecessary mutex in registerState() since Done() is now guaranteed to run after Start() closes r.started - Add log statements for easier debugging of readiness lifecycle --- internal/manager/readiness/manager.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/internal/manager/readiness/manager.go b/internal/manager/readiness/manager.go index eca380eb..b140bd83 100644 --- a/internal/manager/readiness/manager.go +++ b/internal/manager/readiness/manager.go @@ -125,7 +125,8 @@ func (r *readinessManager) Start(ctx context.Context) error { }) } if len(expected) > 0 { - r.log.V(1).Info("registering readiness state", "gvk", gvk, "expected", expected) + r.log.Info("registering readiness state", "gvk", gvk, "registered_count", len(expected)) + r.log.V(1).Info("registered resources for readiness", "gvk", gvk, "resources", expected) r.registerState(gvk, expected) } } @@ -135,13 +136,12 @@ func (r *readinessManager) Start(ctx context.Context) error { r.isReady.Store(true) close(r.done) } + r.log.Info("readiness manager started") }) return err } func (r *readinessManager) registerState(gvk schema.GroupVersionKind, list []k8stypes.NamespacedName) { - r.mu.Lock() - defer r.mu.Unlock() if _, ok := r.state[gvk]; !ok { r.state[gvk] = make(map[k8stypes.NamespacedName]struct{}) } @@ -155,9 +155,12 @@ func (r *readinessManager) Done(obj client.Object, nn k8stypes.NamespacedName) { if r.IsReady() { return } + <-r.started + r.mu.Lock() defer r.mu.Unlock() gvk := types.GvkOf(obj) + r.log.Info("marking resource as done", "gvk", gvk, "name", nn, "state_count", len(r.state[gvk])) if _, ok := r.state[gvk]; !ok { return } @@ -191,7 +194,7 @@ func (r *readinessManager) WaitReady(ctx context.Context, timeout time.Duration) case <-ctx.Done(): return false case <-time.After(timeout): - return true + return false case <-r.done: return true }