diff --git a/.golangci.yml b/.golangci.yml index 0850950..cc60f32 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -58,14 +58,6 @@ linters: - vendor - mocks - dist - rules: - - path: _test\.go - linters: - - errcheck - - gosec - - unparam - - goconst - - gocyclo formatters: enable: diff --git a/README.md b/README.md index 07c9473..e0ad956 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ cocoon-operator/ ├── main.go # manager wiring + flag parsing ├── cocoonset/ # CocoonSet reconciler, pod builders, status diff ├── hibernation/ # CocoonHibernation reconciler -└── epoch/ # SnapshotRegistry interface + epoch HTTP adapter +└── snapshot/ # snapshot.Registry interface consumed by both reconcilers ``` ## Architecture @@ -32,11 +32,11 @@ cocoon-operator/ │ │ │ │ │ ▼ ▼ │ │ ┌────────────────────┐ ┌──────────────────────┐ │ -│ │ controller-runtime │ │ epoch SnapshotRegistry│ │ -│ │ Manager │ │ (HTTP via │ │ -│ │ - leader election │ │ registryclient) │ │ -│ │ - metrics :8080 │ └──────────────────────┘ │ -│ │ - probes :8081 │ │ +│ │ controller-runtime │ │ snapshot.Registry │ │ +│ │ Manager │ │ (HTTP via │ │ +│ │ - leader election │ │ registryclient) │ │ +│ │ - metrics :8080 │ └──────────────────────┘ │ +│ │ - probes :8081 │ │ │ └────────────────────┘ │ └──────────────────────────────────────────────────────────────────┘ ``` @@ -73,7 +73,7 @@ There is no `cocoon-vm-snapshots` ConfigMap bridge — epoch is the single sourc | `OPERATOR_LOG_LEVEL` | `info` | `projecteru2/core/log` level | | `EPOCH_URL` | `http://epoch.cocoon-system.svc:8080` | Base URL of the epoch registry | | `EPOCH_TOKEN` | unset | Bearer token (read-only is enough) | -| `EPOCH_CA_CERT` | unset | Path to PEM-encoded CA certificate for TLS verification against epoch | +| `EPOCH_CA_CERT` | unset | Read internally by `github.com/cocoonstack/epoch/registryclient` — see that package for the exact env names. | | `METRICS_ADDR` | `:8080` | Prometheus listener | | `PROBE_ADDR` | `:8081` | healthz / readyz listener | | `LEADER_ELECT` | `true` | Enable leader election so only one replica reconciles | @@ -125,7 +125,7 @@ The Makefile detects Go workspace mode (`go env GOWORK`) and skips `go mod tidy` |---|---| | [cocoon-common](https://github.com/cocoonstack/cocoon-common) | CRD types, annotation contract, shared helpers | | [cocoon-webhook](https://github.com/cocoonstack/cocoon-webhook) | Admission webhook for sticky scheduling and CocoonSet validation | -| [epoch](https://github.com/cocoonstack/epoch) | Snapshot registry; the operator queries it via `SnapshotRegistry` | +| [epoch](https://github.com/cocoonstack/epoch) | Snapshot registry; the operator queries it via the `snapshot.Registry` interface | | [vk-cocoon](https://github.com/cocoonstack/vk-cocoon) | Virtual kubelet provider managing VM lifecycle | ## License diff --git a/cocoonset/delete.go b/cocoonset/delete.go index f8871af..53575d5 100644 --- a/cocoonset/delete.go +++ b/cocoonset/delete.go @@ -17,11 +17,8 @@ import ( "github.com/cocoonstack/cocoon-common/meta" ) -// annotationDeleteVMNames durably records VM names for the GC step that runs -// after pods are gone, so a CocoonSet deleted before Status.Agents was patched -// (race window between r.Create(pod) and the reconcile that fills status) still -// gets every tag cleaned. Annotation, not status, because it must survive the -// pod-still-terminating requeue without depending on a second-pass status write. +// annotationDeleteVMNames records VM names for the post-pod GC step, so a +// CocoonSet deleted before Status.Agents was patched still gets every tag cleaned. const annotationDeleteVMNames = "cocoonset.cocoonstack.io/delete-vm-names" // reconcileDelete deletes all owned pods, GCs snapshots, and removes the finalizer. @@ -44,7 +41,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cs *cocoonv1.CocoonSet return ctrl.Result{}, fmt.Errorf("stash vm names: %w", err) } - // Phase 1: delete all pods and let vk-cocoon finish snapshot push. + // vk-cocoon completes the snapshot push during the grace period before GC. for i := range owned { if ctxErr := ctx.Err(); ctxErr != nil { return ctrl.Result{}, ctxErr @@ -80,6 +77,9 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cs *cocoonv1.CocoonSet } } } + } else { + // Production always wires r.Epoch; this branch only fires in tests or misconfig. + logger.Warnf(ctx, "skipping epoch tag GC for cocoonset %s/%s: registry not configured", cs.Namespace, cs.Name) } if controllerutil.ContainsFinalizer(cs, finalizerName) { diff --git a/cocoonset/pods.go b/cocoonset/pods.go index ef97f06..b3258f4 100644 --- a/cocoonset/pods.go +++ b/cocoonset/pods.go @@ -162,6 +162,7 @@ func newManagedPod(cs *cocoonv1.CocoonSet, podName, role, slotLabel string, sche // buildAgentPod would produce from the current CocoonSet spec. func podSpecMatchesAgent(pod *corev1.Pod, cs *cocoonv1.CocoonSet, slot int32) bool { current := meta.ParseVMSpec(pod) + // Sub-agents inherit ForkFrom; main agents leave it empty so a manual edit drifts. forkFrom := "" if slot > 0 { forkFrom = current.ForkFrom @@ -206,10 +207,8 @@ func podSpecMatchesToolbox(pod *corev1.Pod, cs *cocoonv1.CocoonSet, tb cocoonv1. return true } -// vmSpecMatches reports whether two VMSpecs are equivalent. Callers -// construct want with VMName/ForkFrom copied from current so those -// fields never trigger a spurious mismatch; struct equality lets any -// future VMSpec field be covered without editing this function. +// vmSpecMatches uses struct equality so any future VMSpec field is covered. +// Callers copy VMName/ForkFrom from current into want to avoid spurious mismatches. func vmSpecMatches(got, want meta.VMSpec) bool { return got == want } diff --git a/cocoonset/reconciler_test.go b/cocoonset/reconciler_test.go index de90e80..6bcdac7 100644 --- a/cocoonset/reconciler_test.go +++ b/cocoonset/reconciler_test.go @@ -147,6 +147,29 @@ func TestEnsureToolboxesCollisionReturnsError(t *testing.T) { } } +func TestEnsureToolboxesRejectsDuplicateNames(t *testing.T) { + scheme := testScheme(t) + cs := newCocoonSet("demo", func(cs *cocoonv1.CocoonSet) { + cs.Spec.Toolboxes = []cocoonv1.ToolboxSpec{ + {Name: "tb", Image: "ghcr.io/cocoonstack/cocoon/toolbox:latest"}, + {Name: "tb", Image: "ghcr.io/cocoonstack/cocoon/toolbox:other"}, + } + }) + + cli := ctrlfake.NewClientBuilder().WithScheme(scheme).Build() + r := &Reconciler{Client: cli, Scheme: scheme} + classified := classifiedPods{ + sub: map[int32]*corev1.Pod{}, + toolbox: map[string]*corev1.Pod{}, + allByName: map[string]*corev1.Pod{}, + } + + _, err := r.ensureToolboxes(t.Context(), cs, classified) + if err == nil { + t.Fatal("ensureToolboxes must reject a spec with duplicate toolbox names") + } +} + func TestEnsureToolboxesIdempotentOnExistingToolbox(t *testing.T) { scheme := testScheme(t) cs := newCocoonSet("demo", func(cs *cocoonv1.CocoonSet) { diff --git a/cocoonset/toolboxes.go b/cocoonset/toolboxes.go index 411d393..cae006d 100644 --- a/cocoonset/toolboxes.go +++ b/cocoonset/toolboxes.go @@ -20,10 +20,17 @@ import ( // Returns true when cluster state was mutated. func (r *Reconciler) ensureToolboxes(ctx context.Context, cs *cocoonv1.CocoonSet, classified classifiedPods) (bool, error) { logger := log.WithFunc("cocoonset.Reconciler.ensureToolboxes") - changed := false - desired := map[string]bool{} + // Defense in depth: webhook should already reject duplicates. Validate + // upfront before any Create/Delete so a bypass can't leave partial state. + desired := make(map[string]bool, len(cs.Spec.Toolboxes)) for _, tb := range cs.Spec.Toolboxes { + if desired[tb.Name] { + return false, fmt.Errorf("duplicate toolbox name %q in spec", tb.Name) + } desired[tb.Name] = true + } + changed := false + for _, tb := range cs.Spec.Toolboxes { podName := fmt.Sprintf("%s-%s", cs.Name, tb.Name) if classified.allByName[podName] != nil && classified.toolbox[tb.Name] == nil { return changed, fmt.Errorf("create toolbox %s: name collision with existing pod %s", tb.Name, podName) diff --git a/go.mod b/go.mod index 1f0006b..4c1866a 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/cocoonstack/cocoon-operator go 1.25.6 require ( - github.com/cocoonstack/cocoon-common v0.2.1-0.20260510051935-9af438194381 + github.com/cocoonstack/cocoon-common v0.2.1-0.20260513092659-fbadeee69373 github.com/cocoonstack/epoch v0.2.3-0.20260506150956-5d672b90749f github.com/go-logr/logr v1.4.3 github.com/projecteru2/core v0.0.0-20241016125006-ff909eefe04c diff --git a/go.sum b/go.sum index 385c43e..cbc59c5 100644 --- a/go.sum +++ b/go.sum @@ -27,8 +27,8 @@ github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b h1:r6VH0faHjZe github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b/go.mod h1:Vz9DsVWQQhf3vs21MhPMZpMGSht7O/2vFW2xusFUVOs= github.com/cockroachdb/redact v1.1.3 h1:AKZds10rFSIj7qADf0g46UixK8NNLwWTNdCIGS5wfSQ= github.com/cockroachdb/redact v1.1.3/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg= -github.com/cocoonstack/cocoon-common v0.2.1-0.20260510051935-9af438194381 h1:YBouruVbi21o/oYzD3EzEA4g68hmpkLsQ9fgE9tDnaM= -github.com/cocoonstack/cocoon-common v0.2.1-0.20260510051935-9af438194381/go.mod h1:xIXbJ83vngQ2mrLC6q0Tw7h21M9BYBBqqYTcHaUrm1Y= +github.com/cocoonstack/cocoon-common v0.2.1-0.20260513092659-fbadeee69373 h1:8xkf1dt/2DiJv0o/7aS9nYY4wJhDUNAggYeA4B0+rs8= +github.com/cocoonstack/cocoon-common v0.2.1-0.20260513092659-fbadeee69373/go.mod h1:xIXbJ83vngQ2mrLC6q0Tw7h21M9BYBBqqYTcHaUrm1Y= github.com/cocoonstack/epoch v0.2.3-0.20260506150956-5d672b90749f h1:ktTB8oZKIRRUykSnlP7dGDRNaP/GBWoeV4Sk5JQAG6Q= github.com/cocoonstack/epoch v0.2.3-0.20260506150956-5d672b90749f/go.mod h1:n8RJp/oJgsHQ2ohR8a0eIWzIxLVbfnCnl2tLh8rn/9M= github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0/go.mod h1:4Zcjuz89kmFXt9morQgcfYZAYZ5n8WHjt81YYWIwtTM= diff --git a/hibernation/reconciler.go b/hibernation/reconciler.go index 0c5f05c..c681fbc 100644 --- a/hibernation/reconciler.go +++ b/hibernation/reconciler.go @@ -175,11 +175,11 @@ func (r *Reconciler) hibernationsTargetingPod(ctx context.Context, obj client.Ob return out } -// setPhase patches status, preserving timestamps on no-op updates. -// On Failed->Waking re-entry, it refreshes LastTransitionTime so the wake deadline -// does not inherit the stale timestamp from the previous failure. +// setPhase patches status when phase, vmName, or generation moved. On +// Failed->Waking re-entry it also refreshes Ready.LastTransitionTime so the +// wake deadline doesn't carry over from the previous failure. func (r *Reconciler) setPhase(ctx context.Context, hib *cocoonv1.CocoonHibernation, phase cocoonv1.CocoonHibernationPhase, vmName string) error { - if hib.Status.Phase == phase && hib.Status.VMName == vmName { + if hib.Status.Phase == phase && hib.Status.VMName == vmName && hib.Status.ObservedGeneration == hib.Generation { return nil } refreshWakeDeadline := phase == cocoonv1.CocoonHibernationPhaseWaking && @@ -214,13 +214,11 @@ func (r *Reconciler) markFailed(ctx context.Context, hib *cocoonv1.CocoonHiberna return nil } -// markPending records that the CR is waiting on external state (pod creation, -// VMName annotation) without pinning the phase to Failed. Self-heals once the -// pod watcher re-enqueues the CR. Short-circuits when the phase and Ready -// condition message already match, so the high-volume pod watcher does not -// generate a PATCH on every event. +// markPending parks the CR on Pending without pinning Failed; self-heals on +// re-enqueue. Short-circuits when phase, generation, and Ready message all +// match so the pod watcher doesn't PATCH on every event. func (r *Reconciler) markPending(ctx context.Context, hib *cocoonv1.CocoonHibernation, msg string) error { - if hib.Status.Phase == cocoonv1.CocoonHibernationPhasePending { + if hib.Status.Phase == cocoonv1.CocoonHibernationPhasePending && hib.Status.ObservedGeneration == hib.Generation { if ready := apimeta.FindStatusCondition(hib.Status.Conditions, commonk8s.ConditionTypeReady); ready != nil && ready.Message == msg { return nil } diff --git a/hibernation/reconciler_test.go b/hibernation/reconciler_test.go index c0f1187..ee56046 100644 --- a/hibernation/reconciler_test.go +++ b/hibernation/reconciler_test.go @@ -463,6 +463,72 @@ func TestReconcileWakeRecoversFromFailed(t *testing.T) { } } +func TestSetPhasePatchesObservedGenerationOnSamePhase(t *testing.T) { + hib := &cocoonv1.CocoonHibernation{ + ObjectMeta: metav1.ObjectMeta{Name: "hib", Namespace: "ns", Generation: 7}, + Status: cocoonv1.CocoonHibernationStatus{ + Phase: cocoonv1.CocoonHibernationPhaseHibernated, + VMName: "vk-ns-demo-0", + ObservedGeneration: 6, // lags by one rev + }, + } + scheme := testScheme(t) + cli := ctrlfake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hib). + WithStatusSubresource(&cocoonv1.CocoonHibernation{}). + Build() + r := &Reconciler{Client: cli, Scheme: scheme} + + if err := r.setPhase(t.Context(), hib, cocoonv1.CocoonHibernationPhaseHibernated, "vk-ns-demo-0"); err != nil { + t.Fatalf("setPhase: %v", err) + } + + var out cocoonv1.CocoonHibernation + if err := cli.Get(t.Context(), types.NamespacedName{Namespace: "ns", Name: "hib"}, &out); err != nil { + t.Fatalf("get hib: %v", err) + } + if out.Status.ObservedGeneration != 7 { + t.Errorf("ObservedGeneration = %d, want 7 (unchanged-phase reconcile must still surface generation bump)", out.Status.ObservedGeneration) + } +} + +func TestMarkPendingPatchesObservedGenerationOnSameMessage(t *testing.T) { + msg := "pod ns/demo-0 not yet present" + hib := &cocoonv1.CocoonHibernation{ + ObjectMeta: metav1.ObjectMeta{Name: "hib", Namespace: "ns", Generation: 4}, + Status: cocoonv1.CocoonHibernationStatus{ + Phase: cocoonv1.CocoonHibernationPhasePending, + ObservedGeneration: 3, // lags by one rev + Conditions: []metav1.Condition{{ + Type: commonk8s.ConditionTypeReady, + Status: metav1.ConditionFalse, + Reason: conditionReasonPending, + Message: msg, + }}, + }, + } + scheme := testScheme(t) + cli := ctrlfake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hib). + WithStatusSubresource(&cocoonv1.CocoonHibernation{}). + Build() + r := &Reconciler{Client: cli, Scheme: scheme} + + if err := r.markPending(t.Context(), hib, msg); err != nil { + t.Fatalf("markPending: %v", err) + } + + var out cocoonv1.CocoonHibernation + if err := cli.Get(t.Context(), types.NamespacedName{Namespace: "ns", Name: "hib"}, &out); err != nil { + t.Fatalf("get hib: %v", err) + } + if out.Status.ObservedGeneration != 4 { + t.Errorf("ObservedGeneration = %d, want 4 (markPending must patch when generation moved even if message is unchanged)", out.Status.ObservedGeneration) + } +} + func TestReconcilePendingWhenPodMissing(t *testing.T) { hib := &cocoonv1.CocoonHibernation{ ObjectMeta: metav1.ObjectMeta{Name: "hib", Namespace: "ns", Finalizers: []string{finalizerName}}, diff --git a/main.go b/main.go index ad4b4f5..a22e5fc 100644 --- a/main.go +++ b/main.go @@ -5,6 +5,7 @@ package main import ( "context" "flag" + "fmt" "os" "os/signal" "syscall" @@ -48,7 +49,10 @@ func main() { flag.Parse() ctx := context.Background() - commonlog.Setup(ctx, "OPERATOR_LOG_LEVEL") + if err := commonlog.Setup(ctx, "OPERATOR_LOG_LEVEL"); err != nil { + fmt.Fprintf(os.Stderr, "setup log: %v\n", err) + os.Exit(1) + } // Silence controller-runtime's own logger; we use core/log instead. crlog.SetLogger(logr.Discard()) logger := log.WithFunc("main")