Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,6 @@ linters:
- vendor
- mocks
- dist
rules:
- path: _test\.go
linters:
- errcheck
- gosec
- unparam
- goconst
- gocyclo

formatters:
enable:
Expand Down
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ cocoon-operator/
├── main.go # manager wiring + flag parsing
├── cocoonset/ # CocoonSet reconciler, pod builders, status diff
├── hibernation/ # CocoonHibernation reconciler
└── epoch/ # SnapshotRegistry interface + epoch HTTP adapter
└── snapshot/ # snapshot.Registry interface consumed by both reconcilers
```

## Architecture
Expand All @@ -32,11 +32,11 @@ cocoon-operator/
│ │ │ │
│ ▼ ▼ │
│ ┌────────────────────┐ ┌──────────────────────┐ │
│ │ controller-runtime │ │ epoch SnapshotRegistry│ │
│ │ Manager │ │ (HTTP via │ │
│ │ - leader election │ │ registryclient) │ │
│ │ - metrics :8080 └──────────────────────┘ │
│ │ - probes :8081
│ │ controller-runtime │ │ snapshot.Registry │ │
│ │ Manager │ │ (HTTP via │ │
│ │ - leader election │ │ registryclient) │ │
│ │ - metrics :8080 └──────────────────────┘ │
│ │ - probes :8081
│ └────────────────────┘ │
└──────────────────────────────────────────────────────────────────┘
```
Expand Down Expand Up @@ -73,7 +73,7 @@ There is no `cocoon-vm-snapshots` ConfigMap bridge — epoch is the single sourc
| `OPERATOR_LOG_LEVEL` | `info` | `projecteru2/core/log` level |
| `EPOCH_URL` | `http://epoch.cocoon-system.svc:8080` | Base URL of the epoch registry |
| `EPOCH_TOKEN` | unset | Bearer token (read-only is enough) |
| `EPOCH_CA_CERT` | unset | Path to PEM-encoded CA certificate for TLS verification against epoch |
| `EPOCH_CA_CERT` | unset | Read internally by `github.com/cocoonstack/epoch/registryclient` — see that package for the exact env names. |
| `METRICS_ADDR` | `:8080` | Prometheus listener |
| `PROBE_ADDR` | `:8081` | healthz / readyz listener |
| `LEADER_ELECT` | `true` | Enable leader election so only one replica reconciles |
Expand Down Expand Up @@ -125,7 +125,7 @@ The Makefile detects Go workspace mode (`go env GOWORK`) and skips `go mod tidy`
|---|---|
| [cocoon-common](https://github.com/cocoonstack/cocoon-common) | CRD types, annotation contract, shared helpers |
| [cocoon-webhook](https://github.com/cocoonstack/cocoon-webhook) | Admission webhook for sticky scheduling and CocoonSet validation |
| [epoch](https://github.com/cocoonstack/epoch) | Snapshot registry; the operator queries it via `SnapshotRegistry` |
| [epoch](https://github.com/cocoonstack/epoch) | Snapshot registry; the operator queries it via the `snapshot.Registry` interface |
| [vk-cocoon](https://github.com/cocoonstack/vk-cocoon) | Virtual kubelet provider managing VM lifecycle |

## License
Expand Down
12 changes: 6 additions & 6 deletions cocoonset/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,8 @@ import (
"github.com/cocoonstack/cocoon-common/meta"
)

// annotationDeleteVMNames durably records VM names for the GC step that runs
// after pods are gone, so a CocoonSet deleted before Status.Agents was patched
// (race window between r.Create(pod) and the reconcile that fills status) still
// gets every tag cleaned. Annotation, not status, because it must survive the
// pod-still-terminating requeue without depending on a second-pass status write.
// annotationDeleteVMNames records VM names for the post-pod GC step, so a
// CocoonSet deleted before Status.Agents was patched still gets every tag cleaned.
const annotationDeleteVMNames = "cocoonset.cocoonstack.io/delete-vm-names"

// reconcileDelete deletes all owned pods, GCs snapshots, and removes the finalizer.
Expand All @@ -44,7 +41,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cs *cocoonv1.CocoonSet
return ctrl.Result{}, fmt.Errorf("stash vm names: %w", err)
}

// Phase 1: delete all pods and let vk-cocoon finish snapshot push.
// vk-cocoon completes the snapshot push during the grace period before GC.
for i := range owned {
if ctxErr := ctx.Err(); ctxErr != nil {
return ctrl.Result{}, ctxErr
Expand Down Expand Up @@ -80,6 +77,9 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cs *cocoonv1.CocoonSet
}
}
}
} else {
// Production always wires r.Epoch; this branch only fires in tests or misconfig.
logger.Warnf(ctx, "skipping epoch tag GC for cocoonset %s/%s: registry not configured", cs.Namespace, cs.Name)
}

if controllerutil.ContainsFinalizer(cs, finalizerName) {
Expand Down
7 changes: 3 additions & 4 deletions cocoonset/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ func newManagedPod(cs *cocoonv1.CocoonSet, podName, role, slotLabel string, sche
// buildAgentPod would produce from the current CocoonSet spec.
func podSpecMatchesAgent(pod *corev1.Pod, cs *cocoonv1.CocoonSet, slot int32) bool {
current := meta.ParseVMSpec(pod)
// Sub-agents inherit ForkFrom; main agents leave it empty so a manual edit drifts.
forkFrom := ""
if slot > 0 {
forkFrom = current.ForkFrom
Expand Down Expand Up @@ -206,10 +207,8 @@ func podSpecMatchesToolbox(pod *corev1.Pod, cs *cocoonv1.CocoonSet, tb cocoonv1.
return true
}

// vmSpecMatches reports whether two VMSpecs are equivalent. Callers
// construct want with VMName/ForkFrom copied from current so those
// fields never trigger a spurious mismatch; struct equality lets any
// future VMSpec field be covered without editing this function.
// vmSpecMatches uses struct equality so any future VMSpec field is covered.
// Callers copy VMName/ForkFrom from current into want to avoid spurious mismatches.
func vmSpecMatches(got, want meta.VMSpec) bool {
return got == want
}
Expand Down
23 changes: 23 additions & 0 deletions cocoonset/reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,29 @@ func TestEnsureToolboxesCollisionReturnsError(t *testing.T) {
}
}

func TestEnsureToolboxesRejectsDuplicateNames(t *testing.T) {
scheme := testScheme(t)
cs := newCocoonSet("demo", func(cs *cocoonv1.CocoonSet) {
cs.Spec.Toolboxes = []cocoonv1.ToolboxSpec{
{Name: "tb", Image: "ghcr.io/cocoonstack/cocoon/toolbox:latest"},
{Name: "tb", Image: "ghcr.io/cocoonstack/cocoon/toolbox:other"},
}
})

cli := ctrlfake.NewClientBuilder().WithScheme(scheme).Build()
r := &Reconciler{Client: cli, Scheme: scheme}
classified := classifiedPods{
sub: map[int32]*corev1.Pod{},
toolbox: map[string]*corev1.Pod{},
allByName: map[string]*corev1.Pod{},
}

_, err := r.ensureToolboxes(t.Context(), cs, classified)
if err == nil {
t.Fatal("ensureToolboxes must reject a spec with duplicate toolbox names")
}
}

func TestEnsureToolboxesIdempotentOnExistingToolbox(t *testing.T) {
scheme := testScheme(t)
cs := newCocoonSet("demo", func(cs *cocoonv1.CocoonSet) {
Expand Down
11 changes: 9 additions & 2 deletions cocoonset/toolboxes.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,17 @@ import (
// Returns true when cluster state was mutated.
func (r *Reconciler) ensureToolboxes(ctx context.Context, cs *cocoonv1.CocoonSet, classified classifiedPods) (bool, error) {
logger := log.WithFunc("cocoonset.Reconciler.ensureToolboxes")
changed := false
desired := map[string]bool{}
// Defense in depth: webhook should already reject duplicates. Validate
// upfront before any Create/Delete so a bypass can't leave partial state.
desired := make(map[string]bool, len(cs.Spec.Toolboxes))
for _, tb := range cs.Spec.Toolboxes {
if desired[tb.Name] {
return false, fmt.Errorf("duplicate toolbox name %q in spec", tb.Name)
}
desired[tb.Name] = true
Comment thread
CMGS marked this conversation as resolved.
}
changed := false
for _, tb := range cs.Spec.Toolboxes {
podName := fmt.Sprintf("%s-%s", cs.Name, tb.Name)
if classified.allByName[podName] != nil && classified.toolbox[tb.Name] == nil {
return changed, fmt.Errorf("create toolbox %s: name collision with existing pod %s", tb.Name, podName)
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/cocoonstack/cocoon-operator
go 1.25.6

require (
github.com/cocoonstack/cocoon-common v0.2.1-0.20260510051935-9af438194381
github.com/cocoonstack/cocoon-common v0.2.1-0.20260513092659-fbadeee69373
github.com/cocoonstack/epoch v0.2.3-0.20260506150956-5d672b90749f
github.com/go-logr/logr v1.4.3
github.com/projecteru2/core v0.0.0-20241016125006-ff909eefe04c
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b h1:r6VH0faHjZe
github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b/go.mod h1:Vz9DsVWQQhf3vs21MhPMZpMGSht7O/2vFW2xusFUVOs=
github.com/cockroachdb/redact v1.1.3 h1:AKZds10rFSIj7qADf0g46UixK8NNLwWTNdCIGS5wfSQ=
github.com/cockroachdb/redact v1.1.3/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg=
github.com/cocoonstack/cocoon-common v0.2.1-0.20260510051935-9af438194381 h1:YBouruVbi21o/oYzD3EzEA4g68hmpkLsQ9fgE9tDnaM=
github.com/cocoonstack/cocoon-common v0.2.1-0.20260510051935-9af438194381/go.mod h1:xIXbJ83vngQ2mrLC6q0Tw7h21M9BYBBqqYTcHaUrm1Y=
github.com/cocoonstack/cocoon-common v0.2.1-0.20260513092659-fbadeee69373 h1:8xkf1dt/2DiJv0o/7aS9nYY4wJhDUNAggYeA4B0+rs8=
github.com/cocoonstack/cocoon-common v0.2.1-0.20260513092659-fbadeee69373/go.mod h1:xIXbJ83vngQ2mrLC6q0Tw7h21M9BYBBqqYTcHaUrm1Y=
github.com/cocoonstack/epoch v0.2.3-0.20260506150956-5d672b90749f h1:ktTB8oZKIRRUykSnlP7dGDRNaP/GBWoeV4Sk5JQAG6Q=
github.com/cocoonstack/epoch v0.2.3-0.20260506150956-5d672b90749f/go.mod h1:n8RJp/oJgsHQ2ohR8a0eIWzIxLVbfnCnl2tLh8rn/9M=
github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0/go.mod h1:4Zcjuz89kmFXt9morQgcfYZAYZ5n8WHjt81YYWIwtTM=
Expand Down
18 changes: 8 additions & 10 deletions hibernation/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,11 @@ func (r *Reconciler) hibernationsTargetingPod(ctx context.Context, obj client.Ob
return out
}

// setPhase patches status, preserving timestamps on no-op updates.
// On Failed->Waking re-entry, it refreshes LastTransitionTime so the wake deadline
// does not inherit the stale timestamp from the previous failure.
// setPhase patches status when phase, vmName, or generation moved. On
// Failed->Waking re-entry it also refreshes Ready.LastTransitionTime so the
// wake deadline doesn't carry over from the previous failure.
func (r *Reconciler) setPhase(ctx context.Context, hib *cocoonv1.CocoonHibernation, phase cocoonv1.CocoonHibernationPhase, vmName string) error {
if hib.Status.Phase == phase && hib.Status.VMName == vmName {
if hib.Status.Phase == phase && hib.Status.VMName == vmName && hib.Status.ObservedGeneration == hib.Generation {
return nil
}
refreshWakeDeadline := phase == cocoonv1.CocoonHibernationPhaseWaking &&
Expand Down Expand Up @@ -214,13 +214,11 @@ func (r *Reconciler) markFailed(ctx context.Context, hib *cocoonv1.CocoonHiberna
return nil
}

// markPending records that the CR is waiting on external state (pod creation,
// VMName annotation) without pinning the phase to Failed. Self-heals once the
// pod watcher re-enqueues the CR. Short-circuits when the phase and Ready
// condition message already match, so the high-volume pod watcher does not
// generate a PATCH on every event.
// markPending parks the CR on Pending without pinning Failed; self-heals on
// re-enqueue. Short-circuits when phase, generation, and Ready message all
// match so the pod watcher doesn't PATCH on every event.
func (r *Reconciler) markPending(ctx context.Context, hib *cocoonv1.CocoonHibernation, msg string) error {
if hib.Status.Phase == cocoonv1.CocoonHibernationPhasePending {
if hib.Status.Phase == cocoonv1.CocoonHibernationPhasePending && hib.Status.ObservedGeneration == hib.Generation {
if ready := apimeta.FindStatusCondition(hib.Status.Conditions, commonk8s.ConditionTypeReady); ready != nil && ready.Message == msg {
return nil
}
Expand Down
66 changes: 66 additions & 0 deletions hibernation/reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,72 @@ func TestReconcileWakeRecoversFromFailed(t *testing.T) {
}
}

func TestSetPhasePatchesObservedGenerationOnSamePhase(t *testing.T) {
hib := &cocoonv1.CocoonHibernation{
ObjectMeta: metav1.ObjectMeta{Name: "hib", Namespace: "ns", Generation: 7},
Status: cocoonv1.CocoonHibernationStatus{
Phase: cocoonv1.CocoonHibernationPhaseHibernated,
VMName: "vk-ns-demo-0",
ObservedGeneration: 6, // lags by one rev
},
}
scheme := testScheme(t)
cli := ctrlfake.NewClientBuilder().
WithScheme(scheme).
WithObjects(hib).
WithStatusSubresource(&cocoonv1.CocoonHibernation{}).
Build()
r := &Reconciler{Client: cli, Scheme: scheme}

if err := r.setPhase(t.Context(), hib, cocoonv1.CocoonHibernationPhaseHibernated, "vk-ns-demo-0"); err != nil {
t.Fatalf("setPhase: %v", err)
}

var out cocoonv1.CocoonHibernation
if err := cli.Get(t.Context(), types.NamespacedName{Namespace: "ns", Name: "hib"}, &out); err != nil {
t.Fatalf("get hib: %v", err)
}
if out.Status.ObservedGeneration != 7 {
t.Errorf("ObservedGeneration = %d, want 7 (unchanged-phase reconcile must still surface generation bump)", out.Status.ObservedGeneration)
}
}

func TestMarkPendingPatchesObservedGenerationOnSameMessage(t *testing.T) {
msg := "pod ns/demo-0 not yet present"
hib := &cocoonv1.CocoonHibernation{
ObjectMeta: metav1.ObjectMeta{Name: "hib", Namespace: "ns", Generation: 4},
Status: cocoonv1.CocoonHibernationStatus{
Phase: cocoonv1.CocoonHibernationPhasePending,
ObservedGeneration: 3, // lags by one rev
Conditions: []metav1.Condition{{
Type: commonk8s.ConditionTypeReady,
Status: metav1.ConditionFalse,
Reason: conditionReasonPending,
Message: msg,
}},
},
}
scheme := testScheme(t)
cli := ctrlfake.NewClientBuilder().
WithScheme(scheme).
WithObjects(hib).
WithStatusSubresource(&cocoonv1.CocoonHibernation{}).
Build()
r := &Reconciler{Client: cli, Scheme: scheme}

if err := r.markPending(t.Context(), hib, msg); err != nil {
t.Fatalf("markPending: %v", err)
}

var out cocoonv1.CocoonHibernation
if err := cli.Get(t.Context(), types.NamespacedName{Namespace: "ns", Name: "hib"}, &out); err != nil {
t.Fatalf("get hib: %v", err)
}
if out.Status.ObservedGeneration != 4 {
t.Errorf("ObservedGeneration = %d, want 4 (markPending must patch when generation moved even if message is unchanged)", out.Status.ObservedGeneration)
}
}

func TestReconcilePendingWhenPodMissing(t *testing.T) {
hib := &cocoonv1.CocoonHibernation{
ObjectMeta: metav1.ObjectMeta{Name: "hib", Namespace: "ns", Finalizers: []string{finalizerName}},
Expand Down
6 changes: 5 additions & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package main
import (
"context"
"flag"
"fmt"
"os"
"os/signal"
"syscall"
Expand Down Expand Up @@ -48,7 +49,10 @@ func main() {
flag.Parse()

ctx := context.Background()
commonlog.Setup(ctx, "OPERATOR_LOG_LEVEL")
if err := commonlog.Setup(ctx, "OPERATOR_LOG_LEVEL"); err != nil {
fmt.Fprintf(os.Stderr, "setup log: %v\n", err)
os.Exit(1)
}
// Silence controller-runtime's own logger; we use core/log instead.
crlog.SetLogger(logr.Discard())
logger := log.WithFunc("main")
Expand Down