From bde4d4cf4c6794635f7b976abf92d4f6395dd436 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:37:52 +0800 Subject: [PATCH 01/12] fix(hibernation): surface generation bumps in setPhase short-circuit The Phase/VMName equality short-circuit silently dropped status patches when only Generation moved, leaving ObservedGeneration stuck behind Generation and tricking clients that rely on it for completion polling. Require ObservedGeneration to also equal Generation before skipping the patch, and cover the path with a regression test. --- hibernation/reconciler.go | 4 +++- hibernation/reconciler_test.go | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/hibernation/reconciler.go b/hibernation/reconciler.go index 0c5f05c..fe6c59c 100644 --- a/hibernation/reconciler.go +++ b/hibernation/reconciler.go @@ -178,8 +178,10 @@ func (r *Reconciler) hibernationsTargetingPod(ctx context.Context, obj client.Ob // setPhase patches status, preserving timestamps on no-op updates. // On Failed->Waking re-entry, it refreshes LastTransitionTime so the wake deadline // does not inherit the stale timestamp from the previous failure. +// The short-circuit also requires ObservedGeneration to already track Generation +// so a spec bump that does not flip phase/VMName still surfaces in /status. func (r *Reconciler) setPhase(ctx context.Context, hib *cocoonv1.CocoonHibernation, phase cocoonv1.CocoonHibernationPhase, vmName string) error { - if hib.Status.Phase == phase && hib.Status.VMName == vmName { + if hib.Status.Phase == phase && hib.Status.VMName == vmName && hib.Status.ObservedGeneration == hib.Generation { return nil } refreshWakeDeadline := phase == cocoonv1.CocoonHibernationPhaseWaking && diff --git a/hibernation/reconciler_test.go b/hibernation/reconciler_test.go index c0f1187..c832ad4 100644 --- a/hibernation/reconciler_test.go +++ b/hibernation/reconciler_test.go @@ -463,6 +463,40 @@ func TestReconcileWakeRecoversFromFailed(t *testing.T) { } } +// TestSetPhasePatchesObservedGenerationOnSamePhase covers the case where a spec +// bump did not change phase or VMName but Generation moved forward; the +// short-circuit must still fire a status PATCH so ObservedGeneration tracks +// Generation (clients use it to detect a stuck reconcile). +func TestSetPhasePatchesObservedGenerationOnSamePhase(t *testing.T) { + hib := &cocoonv1.CocoonHibernation{ + ObjectMeta: metav1.ObjectMeta{Name: "hib", Namespace: "ns", Generation: 7}, + Status: cocoonv1.CocoonHibernationStatus{ + Phase: cocoonv1.CocoonHibernationPhaseHibernated, + VMName: "vk-ns-demo-0", + ObservedGeneration: 6, // lags by one rev + }, + } + scheme := testScheme(t) + cli := ctrlfake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hib). + WithStatusSubresource(&cocoonv1.CocoonHibernation{}). + Build() + r := &Reconciler{Client: cli, Scheme: scheme} + + if err := r.setPhase(t.Context(), hib, cocoonv1.CocoonHibernationPhaseHibernated, "vk-ns-demo-0"); err != nil { + t.Fatalf("setPhase: %v", err) + } + + var out cocoonv1.CocoonHibernation + if err := cli.Get(t.Context(), types.NamespacedName{Namespace: "ns", Name: "hib"}, &out); err != nil { + t.Fatalf("get hib: %v", err) + } + if out.Status.ObservedGeneration != 7 { + t.Errorf("ObservedGeneration = %d, want 7 (unchanged-phase reconcile must still surface generation bump)", out.Status.ObservedGeneration) + } +} + func TestReconcilePendingWhenPodMissing(t *testing.T) { hib := &cocoonv1.CocoonHibernation{ ObjectMeta: metav1.ObjectMeta{Name: "hib", Namespace: "ns", Finalizers: []string{finalizerName}}, From 67a0a92a56e7943eb7a91a4197c1bbbdd149e21a Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:38:44 +0800 Subject: [PATCH 02/12] fix(hibernation): surface generation bumps in markPending short-circuit Same class of bug as setPhase: the phase + message equality guard would swallow status patches when only Generation moved, hiding spec bumps behind a stale ObservedGeneration. Tighten the guard and add a regression test. --- hibernation/reconciler.go | 6 +++-- hibernation/reconciler_test.go | 40 ++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/hibernation/reconciler.go b/hibernation/reconciler.go index fe6c59c..bc1530b 100644 --- a/hibernation/reconciler.go +++ b/hibernation/reconciler.go @@ -220,9 +220,11 @@ func (r *Reconciler) markFailed(ctx context.Context, hib *cocoonv1.CocoonHiberna // VMName annotation) without pinning the phase to Failed. Self-heals once the // pod watcher re-enqueues the CR. Short-circuits when the phase and Ready // condition message already match, so the high-volume pod watcher does not -// generate a PATCH on every event. +// generate a PATCH on every event. The short-circuit also requires +// ObservedGeneration to already track Generation so a spec bump that lands +// while we are pending still surfaces in /status. func (r *Reconciler) markPending(ctx context.Context, hib *cocoonv1.CocoonHibernation, msg string) error { - if hib.Status.Phase == cocoonv1.CocoonHibernationPhasePending { + if hib.Status.Phase == cocoonv1.CocoonHibernationPhasePending && hib.Status.ObservedGeneration == hib.Generation { if ready := apimeta.FindStatusCondition(hib.Status.Conditions, commonk8s.ConditionTypeReady); ready != nil && ready.Message == msg { return nil } diff --git a/hibernation/reconciler_test.go b/hibernation/reconciler_test.go index c832ad4..9add9d4 100644 --- a/hibernation/reconciler_test.go +++ b/hibernation/reconciler_test.go @@ -497,6 +497,46 @@ func TestSetPhasePatchesObservedGenerationOnSamePhase(t *testing.T) { } } +// TestMarkPendingPatchesObservedGenerationOnSameMessage mirrors the setPhase +// case: when the high-volume pod watcher re-enqueues a Pending CR with the +// same message after a spec bump, the short-circuit must still fire a status +// patch so ObservedGeneration tracks Generation. +func TestMarkPendingPatchesObservedGenerationOnSameMessage(t *testing.T) { + msg := "pod ns/demo-0 not yet present" + hib := &cocoonv1.CocoonHibernation{ + ObjectMeta: metav1.ObjectMeta{Name: "hib", Namespace: "ns", Generation: 4}, + Status: cocoonv1.CocoonHibernationStatus{ + Phase: cocoonv1.CocoonHibernationPhasePending, + ObservedGeneration: 3, // lags by one rev + Conditions: []metav1.Condition{{ + Type: commonk8s.ConditionTypeReady, + Status: metav1.ConditionFalse, + Reason: conditionReasonPending, + Message: msg, + }}, + }, + } + scheme := testScheme(t) + cli := ctrlfake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hib). + WithStatusSubresource(&cocoonv1.CocoonHibernation{}). + Build() + r := &Reconciler{Client: cli, Scheme: scheme} + + if err := r.markPending(t.Context(), hib, msg); err != nil { + t.Fatalf("markPending: %v", err) + } + + var out cocoonv1.CocoonHibernation + if err := cli.Get(t.Context(), types.NamespacedName{Namespace: "ns", Name: "hib"}, &out); err != nil { + t.Fatalf("get hib: %v", err) + } + if out.Status.ObservedGeneration != 4 { + t.Errorf("ObservedGeneration = %d, want 4 (markPending must patch when generation moved even if message is unchanged)", out.Status.ObservedGeneration) + } +} + func TestReconcilePendingWhenPodMissing(t *testing.T) { hib := &cocoonv1.CocoonHibernation{ ObjectMeta: metav1.ObjectMeta{Name: "hib", Namespace: "ns", Finalizers: []string{finalizerName}}, From ae361104166a9503b030c4ca4525ca26a3d20006 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:39:32 +0800 Subject: [PATCH 03/12] fix(cocoonset): reject duplicate toolbox names in ensureToolboxes The webhook already validates this, but a stale CRD or a bypass would let one toolbox silently overwrite another's pod. Add a controller-level guard plus a unit test so the failure surfaces even when the webhook is absent or out of date. --- cocoonset/reconciler_test.go | 23 +++++++++++++++++++++++ cocoonset/toolboxes.go | 6 ++++++ 2 files changed, 29 insertions(+) diff --git a/cocoonset/reconciler_test.go b/cocoonset/reconciler_test.go index de90e80..6bcdac7 100644 --- a/cocoonset/reconciler_test.go +++ b/cocoonset/reconciler_test.go @@ -147,6 +147,29 @@ func TestEnsureToolboxesCollisionReturnsError(t *testing.T) { } } +func TestEnsureToolboxesRejectsDuplicateNames(t *testing.T) { + scheme := testScheme(t) + cs := newCocoonSet("demo", func(cs *cocoonv1.CocoonSet) { + cs.Spec.Toolboxes = []cocoonv1.ToolboxSpec{ + {Name: "tb", Image: "ghcr.io/cocoonstack/cocoon/toolbox:latest"}, + {Name: "tb", Image: "ghcr.io/cocoonstack/cocoon/toolbox:other"}, + } + }) + + cli := ctrlfake.NewClientBuilder().WithScheme(scheme).Build() + r := &Reconciler{Client: cli, Scheme: scheme} + classified := classifiedPods{ + sub: map[int32]*corev1.Pod{}, + toolbox: map[string]*corev1.Pod{}, + allByName: map[string]*corev1.Pod{}, + } + + _, err := r.ensureToolboxes(t.Context(), cs, classified) + if err == nil { + t.Fatal("ensureToolboxes must reject a spec with duplicate toolbox names") + } +} + func TestEnsureToolboxesIdempotentOnExistingToolbox(t *testing.T) { scheme := testScheme(t) cs := newCocoonSet("demo", func(cs *cocoonv1.CocoonSet) { diff --git a/cocoonset/toolboxes.go b/cocoonset/toolboxes.go index 411d393..edaaf5e 100644 --- a/cocoonset/toolboxes.go +++ b/cocoonset/toolboxes.go @@ -23,6 +23,12 @@ func (r *Reconciler) ensureToolboxes(ctx context.Context, cs *cocoonv1.CocoonSet changed := false desired := map[string]bool{} for _, tb := range cs.Spec.Toolboxes { + // Defense in depth: the webhook should already reject duplicate + // toolbox names, but an older CRD or a webhook bypass would + // otherwise let one toolbox silently overwrite another's pod. + if _, dup := desired[tb.Name]; dup { + return changed, fmt.Errorf("duplicate toolbox name %q in spec", tb.Name) + } desired[tb.Name] = true podName := fmt.Sprintf("%s-%s", cs.Name, tb.Name) if classified.allByName[podName] != nil && classified.toolbox[tb.Name] == nil { From c1a102a08aa3959db6e87d67c3f060ded9387cd3 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:40:09 +0800 Subject: [PATCH 04/12] chore(cocoonset): warn when delete runs without epoch registry Production wires Epoch unconditionally, so a nil registry at delete time means either a misconfigured deployment or a test fixture. Either way we silently skipped tag GC and left snapshots stranded. Emit a warning so the misconfig surfaces in operator logs. --- cocoonset/delete.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cocoonset/delete.go b/cocoonset/delete.go index f8871af..55430ca 100644 --- a/cocoonset/delete.go +++ b/cocoonset/delete.go @@ -80,6 +80,11 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cs *cocoonv1.CocoonSet } } } + } else { + // Production wires an Epoch client unconditionally; this branch only fires + // in tests or a misconfigured deployment. Log so the operator surfaces the + // configuration drift instead of silently leaking snapshot tags. + logger.Warnf(ctx, "skipping epoch tag GC for cocoonset %s/%s: registry not configured", cs.Namespace, cs.Name) } if controllerutil.ContainsFinalizer(cs, finalizerName) { From 9fc6ea0b116dc8ae8c2c1436b97e714db84ccae4 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:40:34 +0800 Subject: [PATCH 05/12] docs(cocoonset): clarify pod-delete comment in reconcileDelete The "Phase 1" label implies a "Phase 2" downstream that does not exist. Replace with a description of what actually happens. --- cocoonset/delete.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cocoonset/delete.go b/cocoonset/delete.go index 55430ca..e468ec4 100644 --- a/cocoonset/delete.go +++ b/cocoonset/delete.go @@ -44,7 +44,8 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cs *cocoonv1.CocoonSet return ctrl.Result{}, fmt.Errorf("stash vm names: %w", err) } - // Phase 1: delete all pods and let vk-cocoon finish snapshot push. + // Issue delete; vk-cocoon completes the snapshot push during the grace + // period before we GC tags below. for i := range owned { if ctxErr := ctx.Err(); ctxErr != nil { return ctrl.Result{}, ctxErr From 883fa3b0700100dc35696a21764bd15e375c6b58 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:41:03 +0800 Subject: [PATCH 06/12] docs(cocoonset): note ForkFrom drift behavior for main agents in vmSpecMatches podSpecMatchesAgent only copies ForkFrom into want for slot > 0, so any manually-edited fork-from on a main pod is treated as drift and triggers a recreate. Document this so future readers don't restore inheritance. --- cocoonset/pods.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cocoonset/pods.go b/cocoonset/pods.go index ef97f06..061192f 100644 --- a/cocoonset/pods.go +++ b/cocoonset/pods.go @@ -210,6 +210,10 @@ func podSpecMatchesToolbox(pod *corev1.Pod, cs *cocoonv1.CocoonSet, tb cocoonv1. // construct want with VMName/ForkFrom copied from current so those // fields never trigger a spurious mismatch; struct equality lets any // future VMSpec field be covered without editing this function. +// +// ForkFrom is intentionally NOT inherited for main agents (slot 0), so a +// manually-edited fork-from on a main pod is treated as drift and triggers +// a recreate via podSpecMatchesAgent. func vmSpecMatches(got, want meta.VMSpec) bool { return got == want } From d512b1e84327a4e6bc7934b830b0d5e6d89087a4 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:42:06 +0800 Subject: [PATCH 07/12] docs(readme): correct package name from epoch/ to snapshot/ The interface package was renamed but the README directory tree, the architecture diagram, and the related-projects table still referred to it as epoch/SnapshotRegistry. Align all three with the actual snapshot.Registry name and fix the diagram box alignment that drifted along the way. --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 07c9473..742c37e 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ cocoon-operator/ ├── main.go # manager wiring + flag parsing ├── cocoonset/ # CocoonSet reconciler, pod builders, status diff ├── hibernation/ # CocoonHibernation reconciler -└── epoch/ # SnapshotRegistry interface + epoch HTTP adapter +└── snapshot/ # snapshot.Registry interface consumed by both reconcilers ``` ## Architecture @@ -32,11 +32,11 @@ cocoon-operator/ │ │ │ │ │ ▼ ▼ │ │ ┌────────────────────┐ ┌──────────────────────┐ │ -│ │ controller-runtime │ │ epoch SnapshotRegistry│ │ -│ │ Manager │ │ (HTTP via │ │ -│ │ - leader election │ │ registryclient) │ │ -│ │ - metrics :8080 │ └──────────────────────┘ │ -│ │ - probes :8081 │ │ +│ │ controller-runtime │ │ snapshot.Registry │ │ +│ │ Manager │ │ (HTTP via │ │ +│ │ - leader election │ │ registryclient) │ │ +│ │ - metrics :8080 │ └──────────────────────┘ │ +│ │ - probes :8081 │ │ │ └────────────────────┘ │ └──────────────────────────────────────────────────────────────────┘ ``` @@ -125,7 +125,7 @@ The Makefile detects Go workspace mode (`go env GOWORK`) and skips `go mod tidy` |---|---| | [cocoon-common](https://github.com/cocoonstack/cocoon-common) | CRD types, annotation contract, shared helpers | | [cocoon-webhook](https://github.com/cocoonstack/cocoon-webhook) | Admission webhook for sticky scheduling and CocoonSet validation | -| [epoch](https://github.com/cocoonstack/epoch) | Snapshot registry; the operator queries it via `SnapshotRegistry` | +| [epoch](https://github.com/cocoonstack/epoch) | Snapshot registry; the operator queries it via the `snapshot.Registry` interface | | [vk-cocoon](https://github.com/cocoonstack/vk-cocoon) | Virtual kubelet provider managing VM lifecycle | ## License From 17ea874caa93e5563c99372a317a0e2042d87c35 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:45:09 +0800 Subject: [PATCH 08/12] docs(readme): point EPOCH_CA_CERT readers at registryclient The cocoon-operator never reads EPOCH_CA_CERT itself; the value is consumed by epoch/registryclient inside NewFromEnv. Point readers at the canonical source so the env name stays accurate even if the registryclient adds siblings (EPOCH_TLS_INSECURE etc.) in the future. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 742c37e..45a1b1f 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ There is no `cocoon-vm-snapshots` ConfigMap bridge — epoch is the single sourc | `OPERATOR_LOG_LEVEL` | `info` | `projecteru2/core/log` level | | `EPOCH_URL` | `http://epoch.cocoon-system.svc:8080` | Base URL of the epoch registry | | `EPOCH_TOKEN` | unset | Bearer token (read-only is enough) | -| `EPOCH_CA_CERT` | unset | Path to PEM-encoded CA certificate for TLS verification against epoch | +| `EPOCH_CA_CERT` | unset | Read by registryclient internally — see epoch/registryclient for the exact env names. | | `METRICS_ADDR` | `:8080` | Prometheus listener | | `PROBE_ADDR` | `:8081` | healthz / readyz listener | | `LEADER_ELECT` | `true` | Enable leader election so only one replica reconciles | From 79f9edeb23e1a138ce59dbb59e28ba7cb0c16c04 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:45:53 +0800 Subject: [PATCH 09/12] chore(golangci): drop dead per-test exclusion rules run.tests is false, so test files are never linted and the _test\.go-scoped exclusions (errcheck, gosec, unparam, goconst, gocyclo) have no effect. Drop them to make the config self-consistent. Verified behavior unchanged by running both linux + darwin lint passes. --- .golangci.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 0850950..cc60f32 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -58,14 +58,6 @@ linters: - vendor - mocks - dist - rules: - - path: _test\.go - linters: - - errcheck - - gosec - - unparam - - goconst - - gocyclo formatters: enable: From b45680f4f63ca2f9b4844cbc0ae0792048c12b4a Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 17:34:21 +0800 Subject: [PATCH 10/12] chore(deps): bump cocoon-common to fbadeee69373 Adapt to commonlog.Setup now returning an error. --- go.mod | 2 +- go.sum | 4 ++-- main.go | 6 +++++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 1f0006b..4c1866a 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/cocoonstack/cocoon-operator go 1.25.6 require ( - github.com/cocoonstack/cocoon-common v0.2.1-0.20260510051935-9af438194381 + github.com/cocoonstack/cocoon-common v0.2.1-0.20260513092659-fbadeee69373 github.com/cocoonstack/epoch v0.2.3-0.20260506150956-5d672b90749f github.com/go-logr/logr v1.4.3 github.com/projecteru2/core v0.0.0-20241016125006-ff909eefe04c diff --git a/go.sum b/go.sum index 385c43e..cbc59c5 100644 --- a/go.sum +++ b/go.sum @@ -27,8 +27,8 @@ github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b h1:r6VH0faHjZe github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b/go.mod h1:Vz9DsVWQQhf3vs21MhPMZpMGSht7O/2vFW2xusFUVOs= github.com/cockroachdb/redact v1.1.3 h1:AKZds10rFSIj7qADf0g46UixK8NNLwWTNdCIGS5wfSQ= github.com/cockroachdb/redact v1.1.3/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg= -github.com/cocoonstack/cocoon-common v0.2.1-0.20260510051935-9af438194381 h1:YBouruVbi21o/oYzD3EzEA4g68hmpkLsQ9fgE9tDnaM= -github.com/cocoonstack/cocoon-common v0.2.1-0.20260510051935-9af438194381/go.mod h1:xIXbJ83vngQ2mrLC6q0Tw7h21M9BYBBqqYTcHaUrm1Y= +github.com/cocoonstack/cocoon-common v0.2.1-0.20260513092659-fbadeee69373 h1:8xkf1dt/2DiJv0o/7aS9nYY4wJhDUNAggYeA4B0+rs8= +github.com/cocoonstack/cocoon-common v0.2.1-0.20260513092659-fbadeee69373/go.mod h1:xIXbJ83vngQ2mrLC6q0Tw7h21M9BYBBqqYTcHaUrm1Y= github.com/cocoonstack/epoch v0.2.3-0.20260506150956-5d672b90749f h1:ktTB8oZKIRRUykSnlP7dGDRNaP/GBWoeV4Sk5JQAG6Q= github.com/cocoonstack/epoch v0.2.3-0.20260506150956-5d672b90749f/go.mod h1:n8RJp/oJgsHQ2ohR8a0eIWzIxLVbfnCnl2tLh8rn/9M= github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0/go.mod h1:4Zcjuz89kmFXt9morQgcfYZAYZ5n8WHjt81YYWIwtTM= diff --git a/main.go b/main.go index ad4b4f5..a22e5fc 100644 --- a/main.go +++ b/main.go @@ -5,6 +5,7 @@ package main import ( "context" "flag" + "fmt" "os" "os/signal" "syscall" @@ -48,7 +49,10 @@ func main() { flag.Parse() ctx := context.Background() - commonlog.Setup(ctx, "OPERATOR_LOG_LEVEL") + if err := commonlog.Setup(ctx, "OPERATOR_LOG_LEVEL"); err != nil { + fmt.Fprintf(os.Stderr, "setup log: %v\n", err) + os.Exit(1) + } // Silence controller-runtime's own logger; we use core/log instead. crlog.SetLogger(logr.Discard()) logger := log.WithFunc("main") From c77d7a9c7c2222180f0a711f72aedc0b95008a44 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 18:47:12 +0800 Subject: [PATCH 11/12] chore: trim wordy comments on code-review changes --- cocoonset/delete.go | 14 ++++---------- cocoonset/pods.go | 11 +++-------- cocoonset/toolboxes.go | 5 ++--- hibernation/reconciler.go | 18 ++++++------------ hibernation/reconciler_test.go | 8 -------- 5 files changed, 15 insertions(+), 41 deletions(-) diff --git a/cocoonset/delete.go b/cocoonset/delete.go index e468ec4..53575d5 100644 --- a/cocoonset/delete.go +++ b/cocoonset/delete.go @@ -17,11 +17,8 @@ import ( "github.com/cocoonstack/cocoon-common/meta" ) -// annotationDeleteVMNames durably records VM names for the GC step that runs -// after pods are gone, so a CocoonSet deleted before Status.Agents was patched -// (race window between r.Create(pod) and the reconcile that fills status) still -// gets every tag cleaned. Annotation, not status, because it must survive the -// pod-still-terminating requeue without depending on a second-pass status write. +// annotationDeleteVMNames records VM names for the post-pod GC step, so a +// CocoonSet deleted before Status.Agents was patched still gets every tag cleaned. const annotationDeleteVMNames = "cocoonset.cocoonstack.io/delete-vm-names" // reconcileDelete deletes all owned pods, GCs snapshots, and removes the finalizer. @@ -44,8 +41,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cs *cocoonv1.CocoonSet return ctrl.Result{}, fmt.Errorf("stash vm names: %w", err) } - // Issue delete; vk-cocoon completes the snapshot push during the grace - // period before we GC tags below. + // vk-cocoon completes the snapshot push during the grace period before GC. for i := range owned { if ctxErr := ctx.Err(); ctxErr != nil { return ctrl.Result{}, ctxErr @@ -82,9 +78,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cs *cocoonv1.CocoonSet } } } else { - // Production wires an Epoch client unconditionally; this branch only fires - // in tests or a misconfigured deployment. Log so the operator surfaces the - // configuration drift instead of silently leaking snapshot tags. + // Production always wires r.Epoch; this branch only fires in tests or misconfig. logger.Warnf(ctx, "skipping epoch tag GC for cocoonset %s/%s: registry not configured", cs.Namespace, cs.Name) } diff --git a/cocoonset/pods.go b/cocoonset/pods.go index 061192f..b3258f4 100644 --- a/cocoonset/pods.go +++ b/cocoonset/pods.go @@ -162,6 +162,7 @@ func newManagedPod(cs *cocoonv1.CocoonSet, podName, role, slotLabel string, sche // buildAgentPod would produce from the current CocoonSet spec. func podSpecMatchesAgent(pod *corev1.Pod, cs *cocoonv1.CocoonSet, slot int32) bool { current := meta.ParseVMSpec(pod) + // Sub-agents inherit ForkFrom; main agents leave it empty so a manual edit drifts. forkFrom := "" if slot > 0 { forkFrom = current.ForkFrom @@ -206,14 +207,8 @@ func podSpecMatchesToolbox(pod *corev1.Pod, cs *cocoonv1.CocoonSet, tb cocoonv1. return true } -// vmSpecMatches reports whether two VMSpecs are equivalent. Callers -// construct want with VMName/ForkFrom copied from current so those -// fields never trigger a spurious mismatch; struct equality lets any -// future VMSpec field be covered without editing this function. -// -// ForkFrom is intentionally NOT inherited for main agents (slot 0), so a -// manually-edited fork-from on a main pod is treated as drift and triggers -// a recreate via podSpecMatchesAgent. +// vmSpecMatches uses struct equality so any future VMSpec field is covered. +// Callers copy VMName/ForkFrom from current into want to avoid spurious mismatches. func vmSpecMatches(got, want meta.VMSpec) bool { return got == want } diff --git a/cocoonset/toolboxes.go b/cocoonset/toolboxes.go index edaaf5e..970021d 100644 --- a/cocoonset/toolboxes.go +++ b/cocoonset/toolboxes.go @@ -23,9 +23,8 @@ func (r *Reconciler) ensureToolboxes(ctx context.Context, cs *cocoonv1.CocoonSet changed := false desired := map[string]bool{} for _, tb := range cs.Spec.Toolboxes { - // Defense in depth: the webhook should already reject duplicate - // toolbox names, but an older CRD or a webhook bypass would - // otherwise let one toolbox silently overwrite another's pod. + // Defense in depth: the webhook should already reject this, but an older + // CRD or webhook bypass could let one toolbox overwrite another's pod. if _, dup := desired[tb.Name]; dup { return changed, fmt.Errorf("duplicate toolbox name %q in spec", tb.Name) } diff --git a/hibernation/reconciler.go b/hibernation/reconciler.go index bc1530b..c681fbc 100644 --- a/hibernation/reconciler.go +++ b/hibernation/reconciler.go @@ -175,11 +175,9 @@ func (r *Reconciler) hibernationsTargetingPod(ctx context.Context, obj client.Ob return out } -// setPhase patches status, preserving timestamps on no-op updates. -// On Failed->Waking re-entry, it refreshes LastTransitionTime so the wake deadline -// does not inherit the stale timestamp from the previous failure. -// The short-circuit also requires ObservedGeneration to already track Generation -// so a spec bump that does not flip phase/VMName still surfaces in /status. +// setPhase patches status when phase, vmName, or generation moved. On +// Failed->Waking re-entry it also refreshes Ready.LastTransitionTime so the +// wake deadline doesn't carry over from the previous failure. func (r *Reconciler) setPhase(ctx context.Context, hib *cocoonv1.CocoonHibernation, phase cocoonv1.CocoonHibernationPhase, vmName string) error { if hib.Status.Phase == phase && hib.Status.VMName == vmName && hib.Status.ObservedGeneration == hib.Generation { return nil @@ -216,13 +214,9 @@ func (r *Reconciler) markFailed(ctx context.Context, hib *cocoonv1.CocoonHiberna return nil } -// markPending records that the CR is waiting on external state (pod creation, -// VMName annotation) without pinning the phase to Failed. Self-heals once the -// pod watcher re-enqueues the CR. Short-circuits when the phase and Ready -// condition message already match, so the high-volume pod watcher does not -// generate a PATCH on every event. The short-circuit also requires -// ObservedGeneration to already track Generation so a spec bump that lands -// while we are pending still surfaces in /status. +// markPending parks the CR on Pending without pinning Failed; self-heals on +// re-enqueue. Short-circuits when phase, generation, and Ready message all +// match so the pod watcher doesn't PATCH on every event. func (r *Reconciler) markPending(ctx context.Context, hib *cocoonv1.CocoonHibernation, msg string) error { if hib.Status.Phase == cocoonv1.CocoonHibernationPhasePending && hib.Status.ObservedGeneration == hib.Generation { if ready := apimeta.FindStatusCondition(hib.Status.Conditions, commonk8s.ConditionTypeReady); ready != nil && ready.Message == msg { diff --git a/hibernation/reconciler_test.go b/hibernation/reconciler_test.go index 9add9d4..ee56046 100644 --- a/hibernation/reconciler_test.go +++ b/hibernation/reconciler_test.go @@ -463,10 +463,6 @@ func TestReconcileWakeRecoversFromFailed(t *testing.T) { } } -// TestSetPhasePatchesObservedGenerationOnSamePhase covers the case where a spec -// bump did not change phase or VMName but Generation moved forward; the -// short-circuit must still fire a status PATCH so ObservedGeneration tracks -// Generation (clients use it to detect a stuck reconcile). func TestSetPhasePatchesObservedGenerationOnSamePhase(t *testing.T) { hib := &cocoonv1.CocoonHibernation{ ObjectMeta: metav1.ObjectMeta{Name: "hib", Namespace: "ns", Generation: 7}, @@ -497,10 +493,6 @@ func TestSetPhasePatchesObservedGenerationOnSamePhase(t *testing.T) { } } -// TestMarkPendingPatchesObservedGenerationOnSameMessage mirrors the setPhase -// case: when the high-volume pod watcher re-enqueues a Pending CR with the -// same message after a spec bump, the short-circuit must still fire a status -// patch so ObservedGeneration tracks Generation. func TestMarkPendingPatchesObservedGenerationOnSameMessage(t *testing.T) { msg := "pod ns/demo-0 not yet present" hib := &cocoonv1.CocoonHibernation{ From c06a7fadf05f691c403f127c0743eb451c060f09 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 18:53:16 +0800 Subject: [PATCH 12/12] fix(cocoonset): pre-validate toolbox names before any I/O Address Copilot review #4280787224. Hoists the duplicate-name check out of the main loop so an invalid spec like [a, b, a] no longer creates pods for a and b before erroring on the second a. --- README.md | 2 +- cocoonset/toolboxes.go | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 45a1b1f..e0ad956 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ There is no `cocoon-vm-snapshots` ConfigMap bridge — epoch is the single sourc | `OPERATOR_LOG_LEVEL` | `info` | `projecteru2/core/log` level | | `EPOCH_URL` | `http://epoch.cocoon-system.svc:8080` | Base URL of the epoch registry | | `EPOCH_TOKEN` | unset | Bearer token (read-only is enough) | -| `EPOCH_CA_CERT` | unset | Read by registryclient internally — see epoch/registryclient for the exact env names. | +| `EPOCH_CA_CERT` | unset | Read internally by `github.com/cocoonstack/epoch/registryclient` — see that package for the exact env names. | | `METRICS_ADDR` | `:8080` | Prometheus listener | | `PROBE_ADDR` | `:8081` | healthz / readyz listener | | `LEADER_ELECT` | `true` | Enable leader election so only one replica reconciles | diff --git a/cocoonset/toolboxes.go b/cocoonset/toolboxes.go index 970021d..cae006d 100644 --- a/cocoonset/toolboxes.go +++ b/cocoonset/toolboxes.go @@ -20,15 +20,17 @@ import ( // Returns true when cluster state was mutated. func (r *Reconciler) ensureToolboxes(ctx context.Context, cs *cocoonv1.CocoonSet, classified classifiedPods) (bool, error) { logger := log.WithFunc("cocoonset.Reconciler.ensureToolboxes") - changed := false - desired := map[string]bool{} + // Defense in depth: webhook should already reject duplicates. Validate + // upfront before any Create/Delete so a bypass can't leave partial state. + desired := make(map[string]bool, len(cs.Spec.Toolboxes)) for _, tb := range cs.Spec.Toolboxes { - // Defense in depth: the webhook should already reject this, but an older - // CRD or webhook bypass could let one toolbox overwrite another's pod. - if _, dup := desired[tb.Name]; dup { - return changed, fmt.Errorf("duplicate toolbox name %q in spec", tb.Name) + if desired[tb.Name] { + return false, fmt.Errorf("duplicate toolbox name %q in spec", tb.Name) } desired[tb.Name] = true + } + changed := false + for _, tb := range cs.Spec.Toolboxes { podName := fmt.Sprintf("%s-%s", cs.Name, tb.Name) if classified.allByName[podName] != nil && classified.toolbox[tb.Name] == nil { return changed, fmt.Errorf("create toolbox %s: name collision with existing pod %s", tb.Name, podName)