From 0662629f3373ccba0aa66771a6261db86dcf85c2 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:36:52 +0800 Subject: [PATCH 01/20] fix(auth): enforce session expiry in VerifySession VerifySession parsed Exp but never validated it, leaving caller correctness as the only gate against expired cookies. Reject sessions where Exp is set and in the past; Exp == 0 still means "no expiry" so callers that opt out of expiration keep working. Downstream consumers in epoch and glance already perform a redundant Exp check at the call site; those will be cleaned up in coordinated PRs. --- auth/session.go | 8 +++++++- auth/session_test.go | 28 ++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/auth/session.go b/auth/session.go index 89da380..9da62cb 100644 --- a/auth/session.go +++ b/auth/session.go @@ -9,6 +9,7 @@ import ( "encoding/json" "fmt" "strings" + "time" ) // Session holds the claims embedded in an HMAC-signed cookie. @@ -32,7 +33,9 @@ func SignSession(sess Session, key []byte) (string, error) { } // VerifySession validates the HMAC signature and decodes the session. -// Returns nil and false if the signature is invalid or decoding fails. +// Returns nil and false if the signature is invalid, decoding fails, or +// the session has expired (Exp set and in the past). Exp == 0 means +// "no expiry" and is accepted unconditionally. func VerifySession(cookie string, key []byte) (*Session, bool) { payload, sig, ok := strings.Cut(cookie, ".") if !ok { @@ -52,5 +55,8 @@ func VerifySession(cookie string, key []byte) (*Session, bool) { if json.Unmarshal(data, sess) != nil { return nil, false } + if sess.Exp != 0 && sess.Exp < time.Now().Unix() { + return nil, false + } return sess, true } diff --git a/auth/session_test.go b/auth/session_test.go index d809651..a2777ba 100644 --- a/auth/session_test.go +++ b/auth/session_test.go @@ -60,6 +60,34 @@ func TestVerifySessionWrongKey(t *testing.T) { } } +func TestVerifySessionRejectsExpired(t *testing.T) { + t.Parallel() + + key := []byte("test-secret-key-32-bytes-long!!!") + expired := Session{User: "dave", Exp: time.Now().Add(-time.Hour).Unix()} + cookie, err := SignSession(expired, key) + if err != nil { + t.Fatalf("SignSession: %v", err) + } + if _, ok := VerifySession(cookie, key); ok { + t.Error("expected expired cookie to fail") + } +} + +func TestVerifySessionZeroExpAccepted(t *testing.T) { + t.Parallel() + + key := []byte("test-secret-key-32-bytes-long!!!") + // Exp == 0 means "no expiry" — must remain accepted. + cookie, err := SignSession(Session{User: "eve"}, key) + if err != nil { + t.Fatalf("SignSession: %v", err) + } + if _, ok := VerifySession(cookie, key); !ok { + t.Error("expected session with zero Exp to be accepted") + } +} + func TestRandomState(t *testing.T) { t.Parallel() From f97272eebf4fb7e3267d0fdb9d694da2946c2da8 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:37:15 +0800 Subject: [PATCH 02/20] fix(auth): panic on crypto/rand failure in RandomState MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Swallowing rand.Read's error meant a degraded OS RNG would silently return all-zero state, defeating CSRF protection. Panic instead — the syscall is documented to never fail on supported platforms, and if it ever does, halting is the only safe option. --- auth/state.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/auth/state.go b/auth/state.go index ca27304..cee8ae6 100644 --- a/auth/state.go +++ b/auth/state.go @@ -3,12 +3,20 @@ package auth import ( "crypto/rand" "encoding/hex" + "fmt" ) // RandomState returns a cryptographically random 32-character hex string // suitable for OAuth state parameters and CSRF nonces. +// +// Panics if crypto/rand.Read fails. On supported platforms the syscall +// is documented to never fail; a failure indicates the OS RNG is +// unavailable, in which case continuing with a weak nonce would silently +// compromise CSRF protection. func RandomState() string { b := make([]byte, 16) - _, _ = rand.Read(b) //nolint:errcheck // crypto/rand.Read never fails on supported platforms + if _, err := rand.Read(b); err != nil { + panic(fmt.Sprintf("crypto/rand.Read: %v", err)) + } return hex.EncodeToString(b) } From 11b749927f871673819f54bf04e36106361f8f70 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:39:10 +0800 Subject: [PATCH 03/20] fix(k8s): fall back to self-signed when on-disk cert expired MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LoadOrGenerateCert previously accepted any keypair that parsed, so a kubelet whose mounted secret rotated out from under it would silently keep serving an expired cert until the API server refused TLS — a diagnosis-hostile failure mode. Parse the leaf cert after a successful keypair load; if NotAfter is in the past, log a warning and mint a self-signed cert instead. Parse errors on the cert chain are logged but non-fatal so a working keypair is never gratuitously discarded. Keypair load failures (corrupt PEM, mismatched key) continue to surface as fatal — those signal operator misconfiguration and silent substitution would hide them. --- k8s/tls.go | 68 ++++++++++++++++++++++++++++++++++++++++++------- k8s/tls_test.go | 43 +++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 9 deletions(-) diff --git a/k8s/tls.go b/k8s/tls.go index 8ca523f..3d9be4b 100644 --- a/k8s/tls.go +++ b/k8s/tls.go @@ -1,6 +1,7 @@ package k8s import ( + "context" "crypto/ecdsa" "crypto/elliptic" "crypto/rand" @@ -13,29 +14,78 @@ import ( "net" "os" "time" + + "github.com/projecteru2/core/log" ) const localhost = "127.0.0.1" // LoadOrGenerateCert loads a TLS keypair from disk, falling back to a self-signed cert. // Returns a source label for logging ("disk " or "self-signed"). +// +// If the on-disk cert is already expired, a warning is logged and the +// function falls through to mint a self-signed cert — kubelets that +// keep running with a stale cert appear healthy but get rejected by the +// API server with an opaque TLS error. func LoadOrGenerateCert(certPath, keyPath, hostname, ip string) (tls.Certificate, string, error) { - if certPath != "" && keyPath != "" { - if _, err := os.Stat(certPath); err == nil { - cert, err := tls.LoadX509KeyPair(certPath, keyPath) - if err != nil { - return tls.Certificate{}, "", fmt.Errorf("load tls keypair %s: %w", certPath, err) - } - return cert, fmt.Sprintf("disk %s", certPath), nil - } + cert, source, err := tryLoadDiskCert(certPath, keyPath) + if err != nil { + return tls.Certificate{}, "", err } - cert, err := GenerateSelfSignedCert(hostname, ip) + if source != "" { + return cert, source, nil + } + cert, err = GenerateSelfSignedCert(hostname, ip) if err != nil { return tls.Certificate{}, "", fmt.Errorf("generate self-signed cert: %w", err) } return cert, "self-signed", nil } +// tryLoadDiskCert attempts to load the keypair at the configured paths. +// Returns ("", "", nil) when the caller should fall back to a self-signed +// cert (paths empty, cert missing, or cert expired) and propagates a +// non-nil error only when the keypair load itself fails — that signals +// operator misconfiguration which silently substituting a self-signed +// cert would mask. +func tryLoadDiskCert(certPath, keyPath string) (tls.Certificate, string, error) { + if certPath == "" || keyPath == "" { + return tls.Certificate{}, "", nil + } + if _, err := os.Stat(certPath); err != nil { + return tls.Certificate{}, "", nil + } + cert, err := tls.LoadX509KeyPair(certPath, keyPath) + if err != nil { + return tls.Certificate{}, "", fmt.Errorf("load tls keypair %s: %w", certPath, err) + } + if isCertExpired(cert, certPath) { + return tls.Certificate{}, "", nil + } + return cert, fmt.Sprintf("disk %s", certPath), nil +} + +// isCertExpired returns true when the on-disk leaf cert is past its +// NotAfter. Parse failures are logged as warnings (not fatal) and +// treated as "not expired" so a load that succeeded at the tls layer +// is not gratuitously discarded. +func isCertExpired(cert tls.Certificate, certPath string) bool { + logger := log.WithFunc("k8s.LoadOrGenerateCert") + if len(cert.Certificate) == 0 { + return false + } + parsed, err := x509.ParseCertificate(cert.Certificate[0]) + if err != nil { + logger.Warnf(context.Background(), "parse disk cert %s: %v (keeping cert)", certPath, err) + return false + } + if time.Now().After(parsed.NotAfter) { + logger.Warnf(context.Background(), "disk cert %s expired at %s, falling back to self-signed", certPath, parsed.NotAfter.Format(time.RFC3339)) + return true + } + return false +} + // GenerateSelfSignedCert creates an in-memory ECDSA P-256 self-signed cert for hostname and ip. func GenerateSelfSignedCert(hostname, ip string) (tls.Certificate, error) { priv, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) diff --git a/k8s/tls_test.go b/k8s/tls_test.go index f534680..dd1f893 100644 --- a/k8s/tls_test.go +++ b/k8s/tls_test.go @@ -100,6 +100,49 @@ func TestLoadOrGenerateCertLoadsFromDisk(t *testing.T) { } } +func TestLoadOrGenerateCertExpiredDiskCertFallsBack(t *testing.T) { + dir := t.TempDir() + certPath := filepath.Join(dir, "tls.crt") + keyPath := filepath.Join(dir, "tls.key") + + priv, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + t.Fatalf("gen key: %v", err) + } + // NotAfter in the past forces the expiry branch. + tmpl := x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "host"}, + NotBefore: time.Now().Add(-2 * time.Hour), + NotAfter: time.Now().Add(-time.Hour), + KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + IPAddresses: []net.IP{net.ParseIP("10.0.0.1")}, + } + certDER, err := x509.CreateCertificate(rand.Reader, &tmpl, &tmpl, &priv.PublicKey, priv) + if err != nil { + t.Fatalf("create cert: %v", err) + } + privDER, err := x509.MarshalECPrivateKey(priv) + if err != nil { + t.Fatalf("marshal key: %v", err) + } + if err := os.WriteFile(certPath, pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: certDER}), 0o600); err != nil { + t.Fatalf("write cert: %v", err) + } + if err := os.WriteFile(keyPath, pem.EncodeToMemory(&pem.Block{Type: "EC PRIVATE KEY", Bytes: privDER}), 0o600); err != nil { + t.Fatalf("write key: %v", err) + } + + _, source, err := LoadOrGenerateCert(certPath, keyPath, "host", "10.0.0.1") + if err != nil { + t.Fatalf("load: %v", err) + } + if source != "self-signed" { + t.Errorf("expected expired disk cert to fall back to self-signed, got source = %q", source) + } +} + func TestDetectNodeIPReturnsSomething(t *testing.T) { if got := DetectNodeIP(); got == "" { t.Errorf("DetectNodeIP returned empty string") From 2353d973286d419ff39feb1364b9d8646f40e1f0 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:40:05 +0800 Subject: [PATCH 04/20] feat(cocoonset): mark Agents/Toolboxes as listType=map Status arrays without an x-kubernetes-list-type were treated as atomic by Server-Side Apply, so two controllers (e.g. the operator and vk) patching different slots would clobber each other on every write. Tag Agents with listMapKey=slot and Toolboxes with listMapKey=name so apiserver merges entries by their natural identifier. Regenerate the CRD YAML. --- apis/v1/cocoonset_types.go | 4 ++++ apis/v1/crds/cocoonset.cocoonstack.io_cocoonsets.yaml | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/apis/v1/cocoonset_types.go b/apis/v1/cocoonset_types.go index eaf9503..48709ff 100644 --- a/apis/v1/cocoonset_types.go +++ b/apis/v1/cocoonset_types.go @@ -142,9 +142,13 @@ type CocoonSetStatus struct { DesiredToolboxes int32 `json:"desiredToolboxes"` // +optional + // +listType=map + // +listMapKey=slot Agents []AgentStatus `json:"agents,omitempty"` // +optional + // +listType=map + // +listMapKey=name Toolboxes []ToolboxStatus `json:"toolboxes,omitempty"` // +optional diff --git a/apis/v1/crds/cocoonset.cocoonstack.io_cocoonsets.yaml b/apis/v1/crds/cocoonset.cocoonstack.io_cocoonsets.yaml index 544c03a..037fba6 100644 --- a/apis/v1/crds/cocoonset.cocoonstack.io_cocoonsets.yaml +++ b/apis/v1/crds/cocoonset.cocoonstack.io_cocoonsets.yaml @@ -443,6 +443,9 @@ spec: - slot type: object type: array + x-kubernetes-list-map-keys: + - slot + x-kubernetes-list-type: map conditions: items: description: Condition contains details for one aspect of the current @@ -562,6 +565,9 @@ spec: - name type: object type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map type: object type: object served: true From b055bdb9278d09d2d64b866e683c6c853cbd5ee1 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:40:42 +0800 Subject: [PATCH 05/20] refactor(meta): table-driven LifecycleState.IsTerminal Move the const block above the type declaration to match the rest of the package, and back IsTerminal with a package-private terminalStates map so adding or removing a terminal state is a single-line edit instead of a switch-case maintenance burden. --- meta/lifecycle.go | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/meta/lifecycle.go b/meta/lifecycle.go index 2865568..08fe85e 100644 --- a/meta/lifecycle.go +++ b/meta/lifecycle.go @@ -6,10 +6,6 @@ import ( corev1 "k8s.io/api/core/v1" ) -// LifecycleState is the typed contract for the lifecycle-state annotation -// vk-cocoon publishes on a Pod. -type LifecycleState string - const ( LifecycleStateCreating LifecycleState = "creating" LifecycleStateReady LifecycleState = "ready" @@ -18,13 +14,22 @@ const ( LifecycleStateFailed LifecycleState = "failed" ) +// terminalStates is the lookup set IsTerminal consults — keep it in sync +// with the const block above. +var terminalStates = map[LifecycleState]struct{}{ + LifecycleStateReady: {}, + LifecycleStateHibernated: {}, + LifecycleStateFailed: {}, +} + +// LifecycleState is the typed contract for the lifecycle-state annotation +// vk-cocoon publishes on a Pod. +type LifecycleState string + // IsTerminal reports whether s is a state a client would wait for. func (s LifecycleState) IsTerminal() bool { - switch s { - case LifecycleStateReady, LifecycleStateHibernated, LifecycleStateFailed: - return true - } - return false + _, ok := terminalStates[s] + return ok } // LifecycleStatus is the full triple (state, observed-generation, message). From d9156b8666322b6cd765652141bfad8a814bbbd0 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:41:34 +0800 Subject: [PATCH 06/20] refactor(apis): table-driven enum IsValid + cmp.Or-based Default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace each per-type Default's if-empty-return ladder with cmp.Or, and each IsValid's hand-rolled OR chain with slices.Contains against a package-private value table. Adding a new enum member is now a single edit to the const block plus one entry in the validity table — no parallel switch to keep in sync. Behavior is unchanged: all existing IsValid/Default tests pass without modification. --- apis/v1/enums.go | 79 +++++++++++++++++------------------------------- 1 file changed, 28 insertions(+), 51 deletions(-) diff --git a/apis/v1/enums.go b/apis/v1/enums.go index 9bbfe1e..5793d5a 100644 --- a/apis/v1/enums.go +++ b/apis/v1/enums.go @@ -1,5 +1,10 @@ package v1 +import ( + "cmp" + "slices" +) + const ( AgentModeClone AgentMode = "clone" AgentModeRun AgentMode = "run" @@ -32,6 +37,17 @@ const ( BackendFirecracker Backend = "firecracker" ) +// Per-type valid-value tables. Keep ordering aligned with the const +// block above so a new enum member is one edit in each place. +var ( + agentModeValid = []AgentMode{AgentModeClone, AgentModeRun} + toolboxModeValid = []ToolboxMode{ToolboxModeRun, ToolboxModeClone, ToolboxModeStatic} + osTypeValid = []OSType{OSLinux, OSWindows, OSAndroid} + snapshotPolicyValid = []SnapshotPolicy{SnapshotPolicyAlways, SnapshotPolicyMainOnly, SnapshotPolicyNever} + connTypeValid = []ConnType{ConnTypeSSH, ConnTypeRDP, ConnTypeVNC, ConnTypeADB} + backendValid = []Backend{BackendCloudHypervisor, BackendFirecracker} +) + // AgentMode defines the mode of an agent VM. // +kubebuilder:validation:Enum=clone;run type AgentMode string @@ -66,78 +82,39 @@ type ConnType string type Backend string // IsValid reports whether m is a recognized AgentMode value. -func (m AgentMode) IsValid() bool { - return m == AgentModeClone || m == AgentModeRun -} +func (m AgentMode) IsValid() bool { return slices.Contains(agentModeValid, m) } // Default returns m when set, otherwise AgentModeClone. -func (m AgentMode) Default() AgentMode { - if m == "" { - return AgentModeClone - } - return m -} +func (m AgentMode) Default() AgentMode { return cmp.Or(m, AgentModeClone) } // IsValid reports whether m is a recognized ToolboxMode value. -func (m ToolboxMode) IsValid() bool { - return m == ToolboxModeRun || m == ToolboxModeClone || m == ToolboxModeStatic -} +func (m ToolboxMode) IsValid() bool { return slices.Contains(toolboxModeValid, m) } // Default returns m when set, otherwise ToolboxModeRun. -func (m ToolboxMode) Default() ToolboxMode { - if m == "" { - return ToolboxModeRun - } - return m -} +func (m ToolboxMode) Default() ToolboxMode { return cmp.Or(m, ToolboxModeRun) } // IsValid reports whether o is a recognized OSType value. -func (o OSType) IsValid() bool { - return o == OSLinux || o == OSWindows || o == OSAndroid -} +func (o OSType) IsValid() bool { return slices.Contains(osTypeValid, o) } // Default returns o when set, otherwise OSLinux. -func (o OSType) Default() OSType { - if o == "" { - return OSLinux - } - return o -} +func (o OSType) Default() OSType { return cmp.Or(o, OSLinux) } // IsValid reports whether p is a recognized SnapshotPolicy value. -func (p SnapshotPolicy) IsValid() bool { - return p == SnapshotPolicyAlways || p == SnapshotPolicyMainOnly || p == SnapshotPolicyNever -} +func (p SnapshotPolicy) IsValid() bool { return slices.Contains(snapshotPolicyValid, p) } // Default returns p when set, otherwise SnapshotPolicyAlways. -func (p SnapshotPolicy) Default() SnapshotPolicy { - if p == "" { - return SnapshotPolicyAlways - } - return p -} +func (p SnapshotPolicy) Default() SnapshotPolicy { return cmp.Or(p, SnapshotPolicyAlways) } // IsValid reports whether c is a recognized ConnType value. -func (c ConnType) IsValid() bool { - return c == ConnTypeSSH || c == ConnTypeRDP || c == ConnTypeVNC || c == ConnTypeADB -} +func (c ConnType) IsValid() bool { return slices.Contains(connTypeValid, c) } // Default returns c unchanged. Unlike the other enums, ConnType has no // static default: an empty value signals "infer from OS and runtime" // (see meta.ConnectionType), so this method exists only for API symmetry. -func (c ConnType) Default() ConnType { - return c -} +func (c ConnType) Default() ConnType { return c } // IsValid reports whether b is a recognized Backend value. -func (b Backend) IsValid() bool { - return b == BackendCloudHypervisor || b == BackendFirecracker -} +func (b Backend) IsValid() bool { return slices.Contains(backendValid, b) } // Default returns b when set, otherwise BackendCloudHypervisor. -func (b Backend) Default() Backend { - if b == "" { - return BackendCloudHypervisor - } - return b -} +func (b Backend) Default() Backend { return cmp.Or(b, BackendCloudHypervisor) } From c2e70e051ca91e82c7a42076bc6f31732eec73c1 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:43:32 +0800 Subject: [PATCH 07/20] refactor(meta): split meta.go by concern meta.go was a 160-line grab-bag of every constant and helper in the package. Split by concern so each file matches one responsibility: keys.go const vocabulary (annotations, labels, roles) owner.go toleration + owner-reference helpers vmname.go VM name builders + slot extraction connection.go connection-protocol inference meta.go keeps only the package godoc. No exported symbol changes; godoc strings on each annotation/label const are new. --- meta/connection.go | 24 +++++++ meta/keys.go | 88 +++++++++++++++++++++++++ meta/meta.go | 161 ++------------------------------------------- meta/owner.go | 35 ++++++++++ meta/vmname.go | 55 ++++++++++++++++ 5 files changed, 208 insertions(+), 155 deletions(-) create mode 100644 meta/connection.go create mode 100644 meta/keys.go create mode 100644 meta/owner.go create mode 100644 meta/vmname.go diff --git a/meta/connection.go b/meta/connection.go new file mode 100644 index 0000000..e0e42ae --- /dev/null +++ b/meta/connection.go @@ -0,0 +1,24 @@ +package meta + +import ( + cocoonv1 "github.com/cocoonstack/cocoon-common/apis/v1" +) + +// ConnectionType returns the connection protocol. A non-empty override +// (typically AnnotationConnType) wins over OS-based inference, so a Linux +// image running xrdp can advertise rdp without faking its OS field. +func ConnectionType(osType string, hasVNCPort bool, override string) string { + if override != "" { + return override + } + switch { + case hasVNCPort: + return string(cocoonv1.ConnTypeVNC) + case osType == "android": + return string(cocoonv1.ConnTypeADB) + case osType == "windows": + return string(cocoonv1.ConnTypeRDP) + default: + return string(cocoonv1.ConnTypeSSH) + } +} diff --git a/meta/keys.go b/meta/keys.go new file mode 100644 index 0000000..bd2f70e --- /dev/null +++ b/meta/keys.go @@ -0,0 +1,88 @@ +package meta + +const ( + // APIVersion is the apiVersion string for CocoonSet resources. + APIVersion = "cocoonset.cocoonstack.io/v1" + // KindCocoonSet is the kind string for CocoonSet resources. + KindCocoonSet = "CocoonSet" + + // TolerationKey is the virtual-kubelet provider key used to gate cocoon pods onto vk-cocoon nodes. + TolerationKey = "virtual-kubelet.io/provider" + + // LabelCocoonSet stamps a pod with its owning CocoonSet name. + LabelCocoonSet = "cocoonset.cocoonstack.io/name" + // LabelRole stamps a pod with its role (main / sub-agent / toolbox). + LabelRole = "cocoonset.cocoonstack.io/role" + // LabelSlot stamps a pod with its zero-based agent slot index. + LabelSlot = "cocoonset.cocoonstack.io/slot" + + // LabelNodePool selects which cocoon node pool a pod should land on. + LabelNodePool = "cocoonstack.io/pool" + // DefaultNodePool is the pool name used when LabelNodePool is unset. + DefaultNodePool = "default" + // LabelManagedBy follows the kubernetes app convention; reserved for tagging cocoon-managed workloads. + LabelManagedBy = "app.kubernetes.io/managed-by" + + // AnnotationMode declares the VM provisioning mode (clone / run / static). + AnnotationMode = "cocoonset.cocoonstack.io/mode" + // AnnotationImage carries the VM image reference. + AnnotationImage = "cocoonset.cocoonstack.io/image" + // AnnotationStorage carries the VM root volume size (resource.Quantity). + AnnotationStorage = "cocoonset.cocoonstack.io/storage" + // AnnotationManaged marks a VM as cocoon-managed ("true") versus user-managed/static. + AnnotationManaged = "cocoonset.cocoonstack.io/managed" + // AnnotationOS carries the guest OS family (linux / windows / android). + AnnotationOS = "cocoonset.cocoonstack.io/os" + // AnnotationSnapshotPolicy carries the per-pod snapshot policy. + AnnotationSnapshotPolicy = "cocoonset.cocoonstack.io/snapshot-policy" + // AnnotationNetwork carries the cluster network to attach the VM to. + AnnotationNetwork = "cocoonset.cocoonstack.io/network" + // AnnotationForcePull bypasses the image cache when set to "true". + AnnotationForcePull = "cocoonset.cocoonstack.io/force-pull" + // AnnotationCocoonSetGeneration carries the CocoonSet generation stamped at scheduling time. + AnnotationCocoonSetGeneration = "cocoonset.cocoonstack.io/generation" + + // AnnotationVMID carries the runtime VM identifier vk-cocoon assigns after creation. + AnnotationVMID = "vm.cocoonstack.io/id" + // AnnotationVMName carries the deterministic VM name the operator builds from namespace/deployment/slot. + AnnotationVMName = "vm.cocoonstack.io/name" + // AnnotationIP carries the VM's primary IPv4 address. + AnnotationIP = "vm.cocoonstack.io/ip" + // AnnotationVNCPort carries the VM's VNC port when one is exposed. + AnnotationVNCPort = "vm.cocoonstack.io/vnc-port" + // AnnotationHibernate signals "hibernate this VM" when set to "true". + AnnotationHibernate = "vm.cocoonstack.io/hibernate" + // AnnotationForkFrom names a VM to fork the new VM from. + AnnotationForkFrom = "vm.cocoonstack.io/fork-from" + // AnnotationCloneFromDir names a host directory to clone the VM image from (vk-cocoon-specific). + AnnotationCloneFromDir = "vm.cocoonstack.io/clone-from-dir" + // AnnotationConnType overrides the connection protocol inferred from OS/runtime. + AnnotationConnType = "vm.cocoonstack.io/conn-type" + // AnnotationBackend selects the hypervisor backend (cloud-hypervisor / firecracker). + AnnotationBackend = "vm.cocoonstack.io/backend" + // AnnotationNoDirectIO disables O_DIRECT on writable disks when set to "true" (cloud-hypervisor only). + AnnotationNoDirectIO = "vm.cocoonstack.io/no-direct-io" + // AnnotationProbePort overrides the default ICMP readiness probe with a TCP port check. + AnnotationProbePort = "vm.cocoonstack.io/probe-port" + // AnnotationLifecycleState carries the vk-cocoon-reported lifecycle state. + AnnotationLifecycleState = "vm.cocoonstack.io/lifecycle-state" + // AnnotationLifecycleObservedGeneration carries the CocoonSet generation observed by vk-cocoon. + AnnotationLifecycleObservedGeneration = "vm.cocoonstack.io/lifecycle-observed-generation" + // AnnotationLifecycleStateMessage carries an optional message accompanying the lifecycle state. + AnnotationLifecycleStateMessage = "vm.cocoonstack.io/lifecycle-state-message" + + // RoleMain identifies the main agent VM (slot 0). + RoleMain = "main" + // RoleSubAgent identifies a sub-agent VM (slot > 0). + RoleSubAgent = "sub-agent" + // RoleToolbox identifies a toolbox VM. + RoleToolbox = "toolbox" + + // HibernateSnapshotTag names the snapshot tag used for hibernation. + HibernateSnapshotTag = "hibernate" + // DefaultSnapshotTag names the default snapshot tag. + DefaultSnapshotTag = "latest" + + // annotationTrue is the canonical "true" string for bool-valued annotations. + annotationTrue = "true" +) diff --git a/meta/meta.go b/meta/meta.go index 8d31dd6..c00b6ab 100644 --- a/meta/meta.go +++ b/meta/meta.go @@ -1,157 +1,8 @@ // Package meta defines shared metadata keys and naming rules used across Cocoon components. +// +// The package is split by concern: keys.go holds the annotation/label +// vocabulary, owner.go and vmname.go hold the helpers that read or build +// from that vocabulary, and connection.go derives the connection +// protocol. lifecycle.go, hibernate.go, vmspec.go, vmruntime.go, and +// pod.go layer typed contracts on top. package meta - -import ( - "slices" - "strconv" - "strings" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - cocoonv1 "github.com/cocoonstack/cocoon-common/apis/v1" -) - -const ( - APIVersion = "cocoonset.cocoonstack.io/v1" - KindCocoonSet = "CocoonSet" - - TolerationKey = "virtual-kubelet.io/provider" - - LabelCocoonSet = "cocoonset.cocoonstack.io/name" - LabelRole = "cocoonset.cocoonstack.io/role" - LabelSlot = "cocoonset.cocoonstack.io/slot" - - LabelNodePool = "cocoonstack.io/pool" - DefaultNodePool = "default" - LabelManagedBy = "app.kubernetes.io/managed-by" - - AnnotationMode = "cocoonset.cocoonstack.io/mode" - AnnotationImage = "cocoonset.cocoonstack.io/image" - AnnotationStorage = "cocoonset.cocoonstack.io/storage" - AnnotationManaged = "cocoonset.cocoonstack.io/managed" - AnnotationOS = "cocoonset.cocoonstack.io/os" - AnnotationSnapshotPolicy = "cocoonset.cocoonstack.io/snapshot-policy" - AnnotationNetwork = "cocoonset.cocoonstack.io/network" - AnnotationForcePull = "cocoonset.cocoonstack.io/force-pull" - AnnotationCocoonSetGeneration = "cocoonset.cocoonstack.io/generation" - - AnnotationVMID = "vm.cocoonstack.io/id" - AnnotationVMName = "vm.cocoonstack.io/name" - AnnotationIP = "vm.cocoonstack.io/ip" - AnnotationVNCPort = "vm.cocoonstack.io/vnc-port" - AnnotationHibernate = "vm.cocoonstack.io/hibernate" - AnnotationForkFrom = "vm.cocoonstack.io/fork-from" - AnnotationCloneFromDir = "vm.cocoonstack.io/clone-from-dir" - AnnotationConnType = "vm.cocoonstack.io/conn-type" - AnnotationBackend = "vm.cocoonstack.io/backend" - AnnotationNoDirectIO = "vm.cocoonstack.io/no-direct-io" - AnnotationProbePort = "vm.cocoonstack.io/probe-port" - AnnotationLifecycleState = "vm.cocoonstack.io/lifecycle-state" - AnnotationLifecycleObservedGeneration = "vm.cocoonstack.io/lifecycle-observed-generation" - AnnotationLifecycleStateMessage = "vm.cocoonstack.io/lifecycle-state-message" - - RoleMain = "main" - RoleSubAgent = "sub-agent" - RoleToolbox = "toolbox" - - HibernateSnapshotTag = "hibernate" - DefaultSnapshotTag = "latest" - - annotationTrue = "true" -) - -// HasCocoonToleration reports whether the toleration list includes the virtual-kubelet provider key. -func HasCocoonToleration(tolerations []corev1.Toleration) bool { - return slices.ContainsFunc(tolerations, func(t corev1.Toleration) bool { - return t.Key == TolerationKey - }) -} - -// IsOwnedByCocoonSet reports whether any owner reference is a CocoonSet. -func IsOwnedByCocoonSet(ownerRefs []metav1.OwnerReference) bool { - return slices.ContainsFunc(ownerRefs, func(ref metav1.OwnerReference) bool { - return ref.Kind == KindCocoonSet - }) -} - -// OwnerDeploymentName extracts the deployment name from a ReplicaSet owner reference. -func OwnerDeploymentName(ownerRefs []metav1.OwnerReference) string { - for _, ref := range ownerRefs { - if ref.Kind != "ReplicaSet" { - continue - } - if before, _, ok := lastCut(ref.Name, "-"); ok { - return before - } - } - return "" -} - -// VMNameForDeployment builds a deterministic VM name from a deployment and slot index. -func VMNameForDeployment(namespace, deployment string, slot int) string { - return "vk-" + namespace + "-" + deployment + "-" + strconv.Itoa(slot) -} - -// VMNameForPod builds a deterministic VM name from a pod name. -func VMNameForPod(namespace, podName string) string { - return "vk-" + namespace + "-" + podName -} - -// ExtractSlotFromVMName parses the trailing slot index from a VM name, or -1 if absent. -func ExtractSlotFromVMName(vmName string) int { - _, after, ok := lastCut(vmName, "-") - if !ok { - return -1 - } - n, err := strconv.Atoi(after) - if err != nil { - return -1 - } - return n -} - -// MainAgentVMName replaces the slot suffix with 0. Non-slot names are returned unchanged. -func MainAgentVMName(vmName string) string { - if ExtractSlotFromVMName(vmName) < 0 { - return vmName - } - before, _, _ := lastCut(vmName, "-") - return before + "-0" -} - -// InferRoleFromVMName returns RoleMain for slot 0, RoleSubAgent otherwise. -func InferRoleFromVMName(vmName string) string { - if ExtractSlotFromVMName(vmName) == 0 { - return RoleMain - } - return RoleSubAgent -} - -// ConnectionType returns the connection protocol. A non-empty override -// (typically AnnotationConnType) wins over OS-based inference, so a Linux -// image running xrdp can advertise rdp without faking its OS field. -func ConnectionType(osType string, hasVNCPort bool, override string) string { - if override != "" { - return override - } - switch { - case hasVNCPort: - return string(cocoonv1.ConnTypeVNC) - case osType == "android": - return string(cocoonv1.ConnTypeADB) - case osType == "windows": - return string(cocoonv1.ConnTypeRDP) - default: - return string(cocoonv1.ConnTypeSSH) - } -} - -// lastCut is like strings.Cut but splits at the last occurrence of sep. -func lastCut(s, sep string) (before, after string, found bool) { - idx := strings.LastIndex(s, sep) - if idx < 0 { - return s, "", false - } - return s[:idx], s[idx+len(sep):], true -} diff --git a/meta/owner.go b/meta/owner.go new file mode 100644 index 0000000..4bc8b94 --- /dev/null +++ b/meta/owner.go @@ -0,0 +1,35 @@ +package meta + +import ( + "slices" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// HasCocoonToleration reports whether the toleration list includes the virtual-kubelet provider key. +func HasCocoonToleration(tolerations []corev1.Toleration) bool { + return slices.ContainsFunc(tolerations, func(t corev1.Toleration) bool { + return t.Key == TolerationKey + }) +} + +// IsOwnedByCocoonSet reports whether any owner reference is a CocoonSet. +func IsOwnedByCocoonSet(ownerRefs []metav1.OwnerReference) bool { + return slices.ContainsFunc(ownerRefs, func(ref metav1.OwnerReference) bool { + return ref.Kind == KindCocoonSet + }) +} + +// OwnerDeploymentName extracts the deployment name from a ReplicaSet owner reference. +func OwnerDeploymentName(ownerRefs []metav1.OwnerReference) string { + for _, ref := range ownerRefs { + if ref.Kind != "ReplicaSet" { + continue + } + if before, _, ok := lastCut(ref.Name, "-"); ok { + return before + } + } + return "" +} diff --git a/meta/vmname.go b/meta/vmname.go new file mode 100644 index 0000000..b50a84c --- /dev/null +++ b/meta/vmname.go @@ -0,0 +1,55 @@ +package meta + +import ( + "strconv" + "strings" +) + +// VMNameForDeployment builds a deterministic VM name from a deployment and slot index. +func VMNameForDeployment(namespace, deployment string, slot int) string { + return "vk-" + namespace + "-" + deployment + "-" + strconv.Itoa(slot) +} + +// VMNameForPod builds a deterministic VM name from a pod name. +func VMNameForPod(namespace, podName string) string { + return "vk-" + namespace + "-" + podName +} + +// ExtractSlotFromVMName parses the trailing slot index from a VM name, or -1 if absent. +func ExtractSlotFromVMName(vmName string) int { + _, after, ok := lastCut(vmName, "-") + if !ok { + return -1 + } + n, err := strconv.Atoi(after) + if err != nil { + return -1 + } + return n +} + +// MainAgentVMName replaces the slot suffix with 0. Non-slot names are returned unchanged. +func MainAgentVMName(vmName string) string { + if ExtractSlotFromVMName(vmName) < 0 { + return vmName + } + before, _, _ := lastCut(vmName, "-") + return before + "-0" +} + +// InferRoleFromVMName returns RoleMain for slot 0, RoleSubAgent otherwise. +func InferRoleFromVMName(vmName string) string { + if ExtractSlotFromVMName(vmName) == 0 { + return RoleMain + } + return RoleSubAgent +} + +// lastCut is like strings.Cut but splits at the last occurrence of sep. +func lastCut(s, sep string) (before, after string, found bool) { + idx := strings.LastIndex(s, sep) + if idx < 0 { + return s, "", false + } + return s[:idx], s[idx+len(sep):], true +} From a11df46e1ee61d4fcc42f76f8501616758cd07bb Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:43:57 +0800 Subject: [PATCH 08/20] refactor(meta): drop unused LabelManagedBy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LabelManagedBy was declared but never referenced — not by cocoon-common, not by any downstream consumer (vk-cocoon, cocoon-operator, cocoon-webhook, glance, epoch). Dead vocabulary in the public API costs maintenance with no payoff. --- meta/keys.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/meta/keys.go b/meta/keys.go index bd2f70e..42a4516 100644 --- a/meta/keys.go +++ b/meta/keys.go @@ -20,8 +20,6 @@ const ( LabelNodePool = "cocoonstack.io/pool" // DefaultNodePool is the pool name used when LabelNodePool is unset. DefaultNodePool = "default" - // LabelManagedBy follows the kubernetes app convention; reserved for tagging cocoon-managed workloads. - LabelManagedBy = "app.kubernetes.io/managed-by" // AnnotationMode declares the VM provisioning mode (clone / run / static). AnnotationMode = "cocoonset.cocoonstack.io/mode" From 59c391f54dc6e06024b57dd3b34e16c9580e8921 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:45:17 +0800 Subject: [PATCH 09/20] fix(log)!: return error from Setup instead of Fatalf Setup used to call Fatalf on failure, which killed the process before the caller could run defers or surface a structured exit. The new signature returns the wrapped error so callers in main packages can log via their own logger and choose the exit policy. Breaking change: all downstream consumers must check the returned error. Each consumer (vk-cocoon, cocoon-operator, cocoon-webhook, glance, epoch, cocoon-net) will be updated in a coordinated PR. --- log/log.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/log/log.go b/log/log.go index 0476576..0e4ccad 100644 --- a/log/log.go +++ b/log/log.go @@ -3,19 +3,27 @@ package log import ( "context" + "fmt" "os" corelog "github.com/projecteru2/core/log" "github.com/projecteru2/core/types" ) -// Setup initializes the core logger from envVar (default "info"). Fatals on failure. -func Setup(ctx context.Context, envVar string) { +// Setup initializes the core logger from envVar (default "info"). +// +// Returns the underlying setup error so the caller chooses the failure +// policy: a main package can Fatalf with its own logger, while a test +// or library caller can surface the error normally. The previous +// signature swallowed the error inside a Fatalf, which was unfriendly +// to callers that needed clean teardown. +func Setup(ctx context.Context, envVar string) error { level := os.Getenv(envVar) if level == "" { level = "info" } if err := corelog.SetupLog(ctx, &types.ServerLogConfig{Level: level}, ""); err != nil { - corelog.WithFunc("cocooncommon.log.Setup").Fatalf(ctx, err, "setup log: %v", err) + return fmt.Errorf("setup log: %w", err) } + return nil } From 758b4ee5bca61f8321a085a6a031644565ecd3bb Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 15:49:27 +0800 Subject: [PATCH 10/20] refactor(meta)!: return (string, bool) from OwnerDeploymentName The previous string-only return conflated three different signals into "": no owners at all, no ReplicaSet among the owners, and a ReplicaSet whose name had no hash suffix. Callers had no way to tell those apart and either treated empty as benign (silently dropping the "no owner" case) or wrote a duplicate check on the input. Return a comma-ok pair instead so the caller decides. Add table-driven tests for each branch. Breaking change: downstream callers (none today outside cocoon-common, mentioned only in cocoon-specs docs) must adopt the comma-ok form. --- meta/meta_test.go | 41 +++++++++++++++++++++++++++++++++++++---- meta/owner.go | 12 ++++++++---- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/meta/meta_test.go b/meta/meta_test.go index c093a0f..ef493ae 100644 --- a/meta/meta_test.go +++ b/meta/meta_test.go @@ -64,11 +64,44 @@ func TestConnectionType(t *testing.T) { } func TestOwnerDeploymentName(t *testing.T) { - ownerRefs := []metav1.OwnerReference{ - {Kind: "ReplicaSet", Name: "demo-7b7c9d9d5f"}, + cases := []struct { + name string + owners []metav1.OwnerReference + want string + wantOK bool + }{ + { + name: "replicaset with hash suffix", + owners: []metav1.OwnerReference{{Kind: "ReplicaSet", Name: "demo-7b7c9d9d5f"}}, + want: "demo", + wantOK: true, + }, + { + name: "no owners", + owners: nil, + wantOK: false, + }, + { + name: "non-replicaset owner", + owners: []metav1.OwnerReference{{Kind: "Deployment", Name: "demo"}}, + wantOK: false, + }, + { + name: "replicaset with no hash suffix", + owners: []metav1.OwnerReference{{Kind: "ReplicaSet", Name: "demo"}}, + wantOK: false, + }, } - if got := OwnerDeploymentName(ownerRefs); got != "demo" { - t.Fatalf("deployment name mismatch: got %q", got) + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + got, ok := OwnerDeploymentName(tt.owners) + if ok != tt.wantOK { + t.Fatalf("ok = %v, want %v", ok, tt.wantOK) + } + if got != tt.want { + t.Errorf("name = %q, want %q", got, tt.want) + } + }) } } diff --git a/meta/owner.go b/meta/owner.go index 4bc8b94..4f1e74d 100644 --- a/meta/owner.go +++ b/meta/owner.go @@ -21,15 +21,19 @@ func IsOwnedByCocoonSet(ownerRefs []metav1.OwnerReference) bool { }) } -// OwnerDeploymentName extracts the deployment name from a ReplicaSet owner reference. -func OwnerDeploymentName(ownerRefs []metav1.OwnerReference) string { +// OwnerDeploymentName extracts the deployment name from a ReplicaSet +// owner reference. Returns ok=false when no ReplicaSet owner is present +// or its name has no recognizable hash suffix — that lets the caller +// distinguish "no owning deployment" from a legitimately empty name +// instead of conflating both into an empty string. +func OwnerDeploymentName(ownerRefs []metav1.OwnerReference) (string, bool) { for _, ref := range ownerRefs { if ref.Kind != "ReplicaSet" { continue } if before, _, ok := lastCut(ref.Name, "-"); ok { - return before + return before, true } } - return "" + return "", false } From 2127a96fabd3cd17f69414bba060b821845d80c7 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 16:03:38 +0800 Subject: [PATCH 11/20] fix(k8s)!: DetectNodeIP returns (string, error) Returning "127.0.0.1" when net.InterfaceAddrs failed (or returned only loopback / IPv6) made misconfigured hosts look healthy: a vk-cocoon that couldn't see its own primary IP would advertise localhost, register fine with the API server, then break every kubelet TLS handshake from outside. Surface the failure instead. Callers that want the previous fallback write `if err != nil { ip = "127.0.0.1" }` and own the decision. ErrNoNodeIP is exported so callers can branch on the "no IPv4 interface" case specifically. Breaking change: vk-cocoon main.go must adopt the new signature in a coordinated PR. --- k8s/netip.go | 28 ++++++++++++++++++++++------ k8s/tls_test.go | 12 ++++++++++-- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/k8s/netip.go b/k8s/netip.go index f03e88e..f3abef9 100644 --- a/k8s/netip.go +++ b/k8s/netip.go @@ -1,12 +1,28 @@ package k8s -import "net" +import ( + "errors" + "fmt" + "net" +) -// DetectNodeIP returns the first non-loopback IPv4 address, or "127.0.0.1" if none found. -func DetectNodeIP() string { +// ErrNoNodeIP is returned by DetectNodeIP when no non-loopback IPv4 +// address is reachable. Callers decide whether to fall back to a +// configured default — silently substituting 127.0.0.1 was the +// previous behavior, which masks misconfigured network namespaces. +var ErrNoNodeIP = errors.New("no non-loopback IPv4 address found") + +// DetectNodeIP returns the first non-loopback IPv4 address. +// +// Returns a wrapped net.InterfaceAddrs error when the host network +// stack is unavailable, or ErrNoNodeIP when every interface is +// loopback / IPv6-only. The previous implementation returned +// "127.0.0.1" in both failure modes, which made misconfigured hosts +// appear healthy. +func DetectNodeIP() (string, error) { addrs, err := net.InterfaceAddrs() if err != nil { - return localhost + return "", fmt.Errorf("list interface addresses: %w", err) } for _, addr := range addrs { ipNet, ok := addr.(*net.IPNet) @@ -14,8 +30,8 @@ func DetectNodeIP() string { continue } if ip4 := ipNet.IP.To4(); ip4 != nil { - return ip4.String() + return ip4.String(), nil } } - return localhost + return "", ErrNoNodeIP } diff --git a/k8s/tls_test.go b/k8s/tls_test.go index dd1f893..c4a4da5 100644 --- a/k8s/tls_test.go +++ b/k8s/tls_test.go @@ -144,7 +144,15 @@ func TestLoadOrGenerateCertExpiredDiskCertFallsBack(t *testing.T) { } func TestDetectNodeIPReturnsSomething(t *testing.T) { - if got := DetectNodeIP(); got == "" { - t.Errorf("DetectNodeIP returned empty string") + // CI hosts are expected to have at least one non-loopback IPv4 + // interface; if not, the call should return ErrNoNodeIP so this + // test surfaces the environmental issue rather than passing on + // the previous silent fallback to localhost. + got, err := DetectNodeIP() + if err != nil { + t.Skipf("no non-loopback IPv4 on this host: %v", err) + } + if got == "" { + t.Errorf("DetectNodeIP returned empty string with no error") } } From dcb11a9946d7be4845c83da305710294bf236271 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 16:04:24 +0800 Subject: [PATCH 12/20] refactor(meta)!: rename HasCocoonToleration to HasCocoonTolerationKey The function only checks the Toleration Key; Operator, Value, and Effect are ignored. The previous name implied a stronger match than the implementation gave, so any caller wanting strict semantics had to read the body to find out. Rename to make the looseness explicit: the cocoon-webhook gate is intentionally permissive and a stricter check belongs at the call site. Breaking change: downstream callers (cocoon-webhook, glance) must adopt the new name. --- meta/meta_test.go | 10 ++++++++-- meta/owner.go | 9 +++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/meta/meta_test.go b/meta/meta_test.go index ef493ae..453ed46 100644 --- a/meta/meta_test.go +++ b/meta/meta_test.go @@ -105,9 +105,15 @@ func TestOwnerDeploymentName(t *testing.T) { } } -func TestHasCocoonToleration(t *testing.T) { +func TestHasCocoonTolerationKey(t *testing.T) { tolerations := []corev1.Toleration{{Key: TolerationKey}} - if !HasCocoonToleration(tolerations) { + if !HasCocoonTolerationKey(tolerations) { t.Fatalf("expected toleration to be detected") } + if HasCocoonTolerationKey(nil) { + t.Errorf("expected nil tolerations to be rejected") + } + if HasCocoonTolerationKey([]corev1.Toleration{{Key: "other"}}) { + t.Errorf("expected unrelated toleration to be rejected") + } } diff --git a/meta/owner.go b/meta/owner.go index 4f1e74d..965033d 100644 --- a/meta/owner.go +++ b/meta/owner.go @@ -7,8 +7,13 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -// HasCocoonToleration reports whether the toleration list includes the virtual-kubelet provider key. -func HasCocoonToleration(tolerations []corev1.Toleration) bool { +// HasCocoonTolerationKey reports whether the toleration list includes +// an entry whose Key matches TolerationKey. Operator/Value/Effect are +// ignored — the cocoon-webhook gate is intentionally permissive to +// accept any toleration spelling that targets the virtual-kubelet +// taint. Use a stricter check at the call site if you need to match a +// specific Operator or Effect. +func HasCocoonTolerationKey(tolerations []corev1.Toleration) bool { return slices.ContainsFunc(tolerations, func(t corev1.Toleration) bool { return t.Key == TolerationKey }) From 4c7f78af0f9c9d17298a898f3ad042c6eec4b761 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 16:06:43 +0800 Subject: [PATCH 13/20] feat(meta): add toolbox-safe agent-slot helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The legacy ExtractSlotFromVMName / MainAgentVMName / InferRoleFromVMName trio split on the last dash, which misclassifies a toolbox whose pod-name happens to end in -N (e.g. toolbox "db-2" produces vmName vk-NS-CS-db-2 and reads as slot 2). The bug surfaces in glance role inference and in vmspec.ShouldSnapshotVM under MainOnly policy. Add a (namespace, cocoonSet)-anchored variant family: AgentVMNamePrefix shared agent-VM-name prefix builder ExtractAgentSlot parses the slot only when the prefix matches and the suffix has no inner dash MainAgentVMNameFor constant function — no parsing involved InferRoleFromAgentSlot pure int -> role mapping Existing helpers are kept and marked Deprecated in the godoc so downstream consumers can migrate one call site at a time. --- meta/meta_test.go | 86 +++++++++++++++++++++++++++++++++++++++++++++++ meta/vmname.go | 69 +++++++++++++++++++++++++++++++++++-- 2 files changed, 153 insertions(+), 2 deletions(-) diff --git a/meta/meta_test.go b/meta/meta_test.go index 453ed46..0a30035 100644 --- a/meta/meta_test.go +++ b/meta/meta_test.go @@ -39,6 +39,92 @@ func TestInferRoleFromVMName(t *testing.T) { } } +func TestExtractAgentSlot(t *testing.T) { + cases := []struct { + name string + ns string + cocoonSet string + vmName string + want int + }{ + { + name: "main agent", + ns: "prod", + cocoonSet: "demo", + vmName: "vk-prod-demo-0", + want: 0, + }, + { + name: "sub-agent", + ns: "prod", + cocoonSet: "demo", + vmName: "vk-prod-demo-3", + want: 3, + }, + { + // The legacy ExtractSlotFromVMName would mis-read this as + // slot 2 because it splits at the last dash. ExtractAgentSlot + // rejects it because the suffix after the agent prefix + // contains a dash. + name: "toolbox with trailing digit is not an agent slot", + ns: "prod", + cocoonSet: "demo", + vmName: "vk-prod-demo-db-2", + want: -1, + }, + { + name: "toolbox without trailing digit", + ns: "prod", + cocoonSet: "demo", + vmName: "vk-prod-demo-toolbox", + want: -1, + }, + { + name: "different cocoonset", + ns: "prod", + cocoonSet: "demo", + vmName: "vk-prod-other-0", + want: -1, + }, + { + name: "different namespace", + ns: "prod", + cocoonSet: "demo", + vmName: "vk-staging-demo-0", + want: -1, + }, + { + name: "non-vk prefix", + ns: "prod", + cocoonSet: "demo", + vmName: "prod-demo-0", + want: -1, + }, + } + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + if got := ExtractAgentSlot(tt.ns, tt.cocoonSet, tt.vmName); got != tt.want { + t.Errorf("ExtractAgentSlot(%q,%q,%q) = %d, want %d", tt.ns, tt.cocoonSet, tt.vmName, got, tt.want) + } + }) + } +} + +func TestMainAgentVMNameFor(t *testing.T) { + if got := MainAgentVMNameFor("prod", "demo"); got != "vk-prod-demo-0" { + t.Errorf("got %q, want %q", got, "vk-prod-demo-0") + } +} + +func TestInferRoleFromAgentSlot(t *testing.T) { + if got := InferRoleFromAgentSlot(0); got != RoleMain { + t.Errorf("slot 0 = %q, want %q", got, RoleMain) + } + if got := InferRoleFromAgentSlot(7); got != RoleSubAgent { + t.Errorf("slot 7 = %q, want %q", got, RoleSubAgent) + } +} + func TestConnectionType(t *testing.T) { cases := []struct { name string diff --git a/meta/vmname.go b/meta/vmname.go index b50a84c..13b6f1a 100644 --- a/meta/vmname.go +++ b/meta/vmname.go @@ -15,7 +15,62 @@ func VMNameForPod(namespace, podName string) string { return "vk-" + namespace + "-" + podName } -// ExtractSlotFromVMName parses the trailing slot index from a VM name, or -1 if absent. +// AgentVMNamePrefix returns the shared prefix every agent VM name for +// (namespace, cocoonSet) starts with: "vk-NAMESPACE-COCOONSET-". +// Use it together with ExtractAgentSlot to disambiguate agent VM names +// from toolbox VM names whose pod-derived name happens to end in -N. +func AgentVMNamePrefix(namespace, cocoonSet string) string { + return "vk-" + namespace + "-" + cocoonSet + "-" +} + +// ExtractAgentSlot parses the trailing slot index from vmName when it +// matches the agent naming convention for (namespace, cocoonSet). +// Returns -1 for any toolbox VM name even when its pod-name suffix is +// numeric (e.g. a toolbox named "db-2" produces vmName +// "vk-NS-CS-db-2" which the legacy ExtractSlotFromVMName would parse +// as slot 2 — that is a misclassification this function is meant to +// prevent). +func ExtractAgentSlot(namespace, cocoonSet, vmName string) int { + prefix := AgentVMNamePrefix(namespace, cocoonSet) + suffix, ok := strings.CutPrefix(vmName, prefix) + if !ok || strings.Contains(suffix, "-") { + return -1 + } + n, err := strconv.Atoi(suffix) + if err != nil || n < 0 { + return -1 + } + return n +} + +// MainAgentVMNameFor returns the VM name of the main (slot 0) agent +// for (namespace, cocoonSet). Unlike MainAgentVMName it is a constant +// function of its inputs — no parsing — so a toolbox whose name ends +// in -N no longer hijacks the "main agent" rewrite path. +func MainAgentVMNameFor(namespace, cocoonSet string) string { + return VMNameForDeployment(namespace, cocoonSet, 0) +} + +// InferRoleFromAgentSlot returns RoleMain for slot 0 and RoleSubAgent +// for any positive slot. Callers that have already classified a pod as +// an agent (via labels or the AgentVMNamePrefix match) should pass the +// slot they extracted; pods classified as toolboxes belong to +// RoleToolbox and should not be routed through this helper. +func InferRoleFromAgentSlot(slot int) string { + if slot == 0 { + return RoleMain + } + return RoleSubAgent +} + +// ExtractSlotFromVMName parses the trailing slot index from a VM name, +// or -1 if absent. +// +// Deprecated: this helper has no knowledge of agent-versus-toolbox +// naming, so it misclassifies toolbox VM names whose pod-derived +// suffix happens to be numeric (e.g. a toolbox named "db-2" reads as +// slot 2). Prefer ExtractAgentSlot for new code; existing callers that +// know they are looking at agent VM names can keep using this for now. func ExtractSlotFromVMName(vmName string) int { _, after, ok := lastCut(vmName, "-") if !ok { @@ -28,7 +83,11 @@ func ExtractSlotFromVMName(vmName string) int { return n } -// MainAgentVMName replaces the slot suffix with 0. Non-slot names are returned unchanged. +// MainAgentVMName replaces the slot suffix with 0. Non-slot names are +// returned unchanged. +// +// Deprecated: shares the toolbox-collision bug of ExtractSlotFromVMName. +// Prefer MainAgentVMNameFor(namespace, cocoonSet) for new code. func MainAgentVMName(vmName string) string { if ExtractSlotFromVMName(vmName) < 0 { return vmName @@ -38,6 +97,12 @@ func MainAgentVMName(vmName string) string { } // InferRoleFromVMName returns RoleMain for slot 0, RoleSubAgent otherwise. +// +// Deprecated: shares the toolbox-collision bug of ExtractSlotFromVMName. +// Prefer InferRoleFromAgentSlot(ExtractAgentSlot(ns, cs, vmName)) for +// new code; callers that need to handle toolbox VM names should branch +// on AgentVMNamePrefix first and route toolboxes to RoleToolbox +// explicitly. func InferRoleFromVMName(vmName string) string { if ExtractSlotFromVMName(vmName) == 0 { return RoleMain From 4a444900c435b02b6898305f8a3beaf5c42a4368 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 16:17:29 +0800 Subject: [PATCH 14/20] style: drop "previous behavior" leaks from godoc The PR-context framing ("the previous implementation returned X") belongs in the commit message, not in code that has to live with every future reader. --- k8s/netip.go | 8 +++----- k8s/tls_test.go | 5 ++--- log/log.go | 4 +--- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/k8s/netip.go b/k8s/netip.go index f3abef9..e18404f 100644 --- a/k8s/netip.go +++ b/k8s/netip.go @@ -8,17 +8,15 @@ import ( // ErrNoNodeIP is returned by DetectNodeIP when no non-loopback IPv4 // address is reachable. Callers decide whether to fall back to a -// configured default — silently substituting 127.0.0.1 was the -// previous behavior, which masks misconfigured network namespaces. +// configured default; auto-substituting 127.0.0.1 would mask +// misconfigured network namespaces. var ErrNoNodeIP = errors.New("no non-loopback IPv4 address found") // DetectNodeIP returns the first non-loopback IPv4 address. // // Returns a wrapped net.InterfaceAddrs error when the host network // stack is unavailable, or ErrNoNodeIP when every interface is -// loopback / IPv6-only. The previous implementation returned -// "127.0.0.1" in both failure modes, which made misconfigured hosts -// appear healthy. +// loopback / IPv6-only. func DetectNodeIP() (string, error) { addrs, err := net.InterfaceAddrs() if err != nil { diff --git a/k8s/tls_test.go b/k8s/tls_test.go index c4a4da5..3cd77cb 100644 --- a/k8s/tls_test.go +++ b/k8s/tls_test.go @@ -145,9 +145,8 @@ func TestLoadOrGenerateCertExpiredDiskCertFallsBack(t *testing.T) { func TestDetectNodeIPReturnsSomething(t *testing.T) { // CI hosts are expected to have at least one non-loopback IPv4 - // interface; if not, the call should return ErrNoNodeIP so this - // test surfaces the environmental issue rather than passing on - // the previous silent fallback to localhost. + // interface; skip when none is present so the test reflects the + // environment rather than masking it as a pass. got, err := DetectNodeIP() if err != nil { t.Skipf("no non-loopback IPv4 on this host: %v", err) diff --git a/log/log.go b/log/log.go index 0e4ccad..6d908d2 100644 --- a/log/log.go +++ b/log/log.go @@ -14,9 +14,7 @@ import ( // // Returns the underlying setup error so the caller chooses the failure // policy: a main package can Fatalf with its own logger, while a test -// or library caller can surface the error normally. The previous -// signature swallowed the error inside a Fatalf, which was unfriendly -// to callers that needed clean teardown. +// or library caller can surface the error and tear down cleanly. func Setup(ctx context.Context, envVar string) error { level := os.Getenv(envVar) if level == "" { From 6b51984c9a76964de09391459deec66280ec7718 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 16:51:00 +0800 Subject: [PATCH 15/20] style: tighten godocs across packages Trim narrative godocs that restate the signature or describe historical behavior. Keep only the contract and non-obvious WHY. --- auth/session.go | 4 +--- auth/state.go | 6 +----- k8s/netip.go | 14 +++++--------- log/log.go | 4 ---- meta/meta.go | 9 ++------- meta/owner.go | 14 ++++---------- meta/vmname.go | 37 ++++++++++--------------------------- 7 files changed, 23 insertions(+), 65 deletions(-) diff --git a/auth/session.go b/auth/session.go index 9da62cb..1609044 100644 --- a/auth/session.go +++ b/auth/session.go @@ -33,9 +33,7 @@ func SignSession(sess Session, key []byte) (string, error) { } // VerifySession validates the HMAC signature and decodes the session. -// Returns nil and false if the signature is invalid, decoding fails, or -// the session has expired (Exp set and in the past). Exp == 0 means -// "no expiry" and is accepted unconditionally. +// Exp == 0 means "no expiry". func VerifySession(cookie string, key []byte) (*Session, bool) { payload, sig, ok := strings.Cut(cookie, ".") if !ok { diff --git a/auth/state.go b/auth/state.go index cee8ae6..d3c52b3 100644 --- a/auth/state.go +++ b/auth/state.go @@ -8,11 +8,7 @@ import ( // RandomState returns a cryptographically random 32-character hex string // suitable for OAuth state parameters and CSRF nonces. -// -// Panics if crypto/rand.Read fails. On supported platforms the syscall -// is documented to never fail; a failure indicates the OS RNG is -// unavailable, in which case continuing with a weak nonce would silently -// compromise CSRF protection. +// Panics on crypto/rand failure — a weak nonce silently breaks CSRF. func RandomState() string { b := make([]byte, 16) if _, err := rand.Read(b); err != nil { diff --git a/k8s/netip.go b/k8s/netip.go index e18404f..843772b 100644 --- a/k8s/netip.go +++ b/k8s/netip.go @@ -6,17 +6,13 @@ import ( "net" ) -// ErrNoNodeIP is returned by DetectNodeIP when no non-loopback IPv4 -// address is reachable. Callers decide whether to fall back to a -// configured default; auto-substituting 127.0.0.1 would mask -// misconfigured network namespaces. +// ErrNoNodeIP is returned when no non-loopback IPv4 address is +// reachable. Callers pick the fallback — auto-substituting localhost +// would mask misconfigured network namespaces. var ErrNoNodeIP = errors.New("no non-loopback IPv4 address found") -// DetectNodeIP returns the first non-loopback IPv4 address. -// -// Returns a wrapped net.InterfaceAddrs error when the host network -// stack is unavailable, or ErrNoNodeIP when every interface is -// loopback / IPv6-only. +// DetectNodeIP returns the first non-loopback IPv4 address, or +// ErrNoNodeIP if none exists. func DetectNodeIP() (string, error) { addrs, err := net.InterfaceAddrs() if err != nil { diff --git a/log/log.go b/log/log.go index 6d908d2..6fe65cd 100644 --- a/log/log.go +++ b/log/log.go @@ -11,10 +11,6 @@ import ( ) // Setup initializes the core logger from envVar (default "info"). -// -// Returns the underlying setup error so the caller chooses the failure -// policy: a main package can Fatalf with its own logger, while a test -// or library caller can surface the error and tear down cleanly. func Setup(ctx context.Context, envVar string) error { level := os.Getenv(envVar) if level == "" { diff --git a/meta/meta.go b/meta/meta.go index c00b6ab..b7465e8 100644 --- a/meta/meta.go +++ b/meta/meta.go @@ -1,8 +1,3 @@ -// Package meta defines shared metadata keys and naming rules used across Cocoon components. -// -// The package is split by concern: keys.go holds the annotation/label -// vocabulary, owner.go and vmname.go hold the helpers that read or build -// from that vocabulary, and connection.go derives the connection -// protocol. lifecycle.go, hibernate.go, vmspec.go, vmruntime.go, and -// pod.go layer typed contracts on top. +// Package meta defines shared metadata keys and naming rules used +// across Cocoon components. package meta diff --git a/meta/owner.go b/meta/owner.go index 965033d..9d0138a 100644 --- a/meta/owner.go +++ b/meta/owner.go @@ -7,12 +7,9 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -// HasCocoonTolerationKey reports whether the toleration list includes -// an entry whose Key matches TolerationKey. Operator/Value/Effect are -// ignored — the cocoon-webhook gate is intentionally permissive to -// accept any toleration spelling that targets the virtual-kubelet -// taint. Use a stricter check at the call site if you need to match a -// specific Operator or Effect. +// HasCocoonTolerationKey reports whether tolerations include an entry +// whose Key matches TolerationKey. Operator/Value/Effect are ignored — +// the cocoon-webhook gate is intentionally permissive. func HasCocoonTolerationKey(tolerations []corev1.Toleration) bool { return slices.ContainsFunc(tolerations, func(t corev1.Toleration) bool { return t.Key == TolerationKey @@ -27,10 +24,7 @@ func IsOwnedByCocoonSet(ownerRefs []metav1.OwnerReference) bool { } // OwnerDeploymentName extracts the deployment name from a ReplicaSet -// owner reference. Returns ok=false when no ReplicaSet owner is present -// or its name has no recognizable hash suffix — that lets the caller -// distinguish "no owning deployment" from a legitimately empty name -// instead of conflating both into an empty string. +// owner reference. Returns ok=false when absent or unparseable. func OwnerDeploymentName(ownerRefs []metav1.OwnerReference) (string, bool) { for _, ref := range ownerRefs { if ref.Kind != "ReplicaSet" { diff --git a/meta/vmname.go b/meta/vmname.go index 13b6f1a..cbdbbbd 100644 --- a/meta/vmname.go +++ b/meta/vmname.go @@ -15,21 +15,15 @@ func VMNameForPod(namespace, podName string) string { return "vk-" + namespace + "-" + podName } -// AgentVMNamePrefix returns the shared prefix every agent VM name for -// (namespace, cocoonSet) starts with: "vk-NAMESPACE-COCOONSET-". -// Use it together with ExtractAgentSlot to disambiguate agent VM names -// from toolbox VM names whose pod-derived name happens to end in -N. +// AgentVMNamePrefix returns "vk-NAMESPACE-COCOONSET-", the prefix every +// agent VM name shares. func AgentVMNamePrefix(namespace, cocoonSet string) string { return "vk-" + namespace + "-" + cocoonSet + "-" } // ExtractAgentSlot parses the trailing slot index from vmName when it -// matches the agent naming convention for (namespace, cocoonSet). -// Returns -1 for any toolbox VM name even when its pod-name suffix is -// numeric (e.g. a toolbox named "db-2" produces vmName -// "vk-NS-CS-db-2" which the legacy ExtractSlotFromVMName would parse -// as slot 2 — that is a misclassification this function is meant to -// prevent). +// matches the agent naming convention for (namespace, cocoonSet), or +// -1 for any toolbox VM name (e.g. "vk-NS-CS-db-2"). func ExtractAgentSlot(namespace, cocoonSet, vmName string) int { prefix := AgentVMNamePrefix(namespace, cocoonSet) suffix, ok := strings.CutPrefix(vmName, prefix) @@ -44,18 +38,13 @@ func ExtractAgentSlot(namespace, cocoonSet, vmName string) int { } // MainAgentVMNameFor returns the VM name of the main (slot 0) agent -// for (namespace, cocoonSet). Unlike MainAgentVMName it is a constant -// function of its inputs — no parsing — so a toolbox whose name ends -// in -N no longer hijacks the "main agent" rewrite path. +// for (namespace, cocoonSet). func MainAgentVMNameFor(namespace, cocoonSet string) string { return VMNameForDeployment(namespace, cocoonSet, 0) } -// InferRoleFromAgentSlot returns RoleMain for slot 0 and RoleSubAgent -// for any positive slot. Callers that have already classified a pod as -// an agent (via labels or the AgentVMNamePrefix match) should pass the -// slot they extracted; pods classified as toolboxes belong to -// RoleToolbox and should not be routed through this helper. +// InferRoleFromAgentSlot returns RoleMain for slot 0, RoleSubAgent +// otherwise. Toolboxes use RoleToolbox; do not route them through here. func InferRoleFromAgentSlot(slot int) string { if slot == 0 { return RoleMain @@ -66,11 +55,8 @@ func InferRoleFromAgentSlot(slot int) string { // ExtractSlotFromVMName parses the trailing slot index from a VM name, // or -1 if absent. // -// Deprecated: this helper has no knowledge of agent-versus-toolbox -// naming, so it misclassifies toolbox VM names whose pod-derived -// suffix happens to be numeric (e.g. a toolbox named "db-2" reads as -// slot 2). Prefer ExtractAgentSlot for new code; existing callers that -// know they are looking at agent VM names can keep using this for now. +// Deprecated: misclassifies toolbox names with numeric suffixes (e.g. +// "vk-NS-CS-db-2" → slot 2). Prefer ExtractAgentSlot. func ExtractSlotFromVMName(vmName string) int { _, after, ok := lastCut(vmName, "-") if !ok { @@ -99,10 +85,7 @@ func MainAgentVMName(vmName string) string { // InferRoleFromVMName returns RoleMain for slot 0, RoleSubAgent otherwise. // // Deprecated: shares the toolbox-collision bug of ExtractSlotFromVMName. -// Prefer InferRoleFromAgentSlot(ExtractAgentSlot(ns, cs, vmName)) for -// new code; callers that need to handle toolbox VM names should branch -// on AgentVMNamePrefix first and route toolboxes to RoleToolbox -// explicitly. +// Prefer InferRoleFromAgentSlot(ExtractAgentSlot(ns, cs, vmName)). func InferRoleFromVMName(vmName string) string { if ExtractSlotFromVMName(vmName) == 0 { return RoleMain From 91349704f58c5848b8006af547032a3c4e6b6ad6 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 16:51:13 +0800 Subject: [PATCH 16/20] refactor!: thread ctx through LoadOrGenerateCert; tighten meta helpers k8s/tls.go: LoadOrGenerateCert now takes ctx as its first argument and threads it through tryLoadDiskCert and isCertExpired so library logging no longer fabricates context.Background(). Drop the redundant os.Stat pre-check in tryLoadDiskCert; LoadX509KeyPair's fs.ErrNotExist covers the "missing cert" path and stops swallowing permission errors. Capture time.Now() once in GenerateSelfSignedCert. meta/lifecycle.go: inline LifecycleStatus.Apply so the per-pod reconcile path no longer allocates a throwaway map[string]any with boxed values just to iterate it. meta/connection.go: compare against string(cocoonv1.OS*) instead of raw "android"/"windows" literals so renames break the build. --- k8s/tls.go | 56 +++++++++++++++++++++------------------------- k8s/tls_test.go | 6 ++--- meta/connection.go | 7 +++--- meta/lifecycle.go | 44 ++++++++++++++---------------------- 4 files changed, 48 insertions(+), 65 deletions(-) diff --git a/k8s/tls.go b/k8s/tls.go index 3d9be4b..ef16bbe 100644 --- a/k8s/tls.go +++ b/k8s/tls.go @@ -9,10 +9,11 @@ import ( "crypto/x509" "crypto/x509/pkix" "encoding/pem" + "errors" "fmt" + "io/fs" "math/big" "net" - "os" "time" "github.com/projecteru2/core/log" @@ -20,15 +21,11 @@ import ( const localhost = "127.0.0.1" -// LoadOrGenerateCert loads a TLS keypair from disk, falling back to a self-signed cert. -// Returns a source label for logging ("disk " or "self-signed"). -// -// If the on-disk cert is already expired, a warning is logged and the -// function falls through to mint a self-signed cert — kubelets that -// keep running with a stale cert appear healthy but get rejected by the -// API server with an opaque TLS error. -func LoadOrGenerateCert(certPath, keyPath, hostname, ip string) (tls.Certificate, string, error) { - cert, source, err := tryLoadDiskCert(certPath, keyPath) +// LoadOrGenerateCert loads a TLS keypair from disk, falling back to a +// self-signed cert when paths are empty, the cert is missing, or the +// cert is expired. Returns a source label for logging. +func LoadOrGenerateCert(ctx context.Context, certPath, keyPath, hostname, ip string) (tls.Certificate, string, error) { + cert, source, err := tryLoadDiskCert(ctx, certPath, keyPath) if err != nil { return tls.Certificate{}, "", err } @@ -42,61 +39,58 @@ func LoadOrGenerateCert(certPath, keyPath, hostname, ip string) (tls.Certificate return cert, "self-signed", nil } -// tryLoadDiskCert attempts to load the keypair at the configured paths. -// Returns ("", "", nil) when the caller should fall back to a self-signed -// cert (paths empty, cert missing, or cert expired) and propagates a -// non-nil error only when the keypair load itself fails — that signals -// operator misconfiguration which silently substituting a self-signed -// cert would mask. -func tryLoadDiskCert(certPath, keyPath string) (tls.Certificate, string, error) { +// tryLoadDiskCert returns ("", "", nil) when the caller should fall +// back to self-signed (paths empty, cert missing, or expired) and an +// error only when a configured keypair fails to load. +func tryLoadDiskCert(ctx context.Context, certPath, keyPath string) (tls.Certificate, string, error) { if certPath == "" || keyPath == "" { return tls.Certificate{}, "", nil } - if _, err := os.Stat(certPath); err != nil { - return tls.Certificate{}, "", nil - } cert, err := tls.LoadX509KeyPair(certPath, keyPath) if err != nil { + if errors.Is(err, fs.ErrNotExist) { + return tls.Certificate{}, "", nil + } return tls.Certificate{}, "", fmt.Errorf("load tls keypair %s: %w", certPath, err) } - if isCertExpired(cert, certPath) { + if isCertExpired(ctx, cert, certPath) { return tls.Certificate{}, "", nil } return cert, fmt.Sprintf("disk %s", certPath), nil } -// isCertExpired returns true when the on-disk leaf cert is past its -// NotAfter. Parse failures are logged as warnings (not fatal) and -// treated as "not expired" so a load that succeeded at the tls layer -// is not gratuitously discarded. -func isCertExpired(cert tls.Certificate, certPath string) bool { +// isCertExpired returns true when the leaf cert is past NotAfter. +// Parse failures are warned and treated as "not expired". +func isCertExpired(ctx context.Context, cert tls.Certificate, certPath string) bool { logger := log.WithFunc("k8s.LoadOrGenerateCert") if len(cert.Certificate) == 0 { return false } parsed, err := x509.ParseCertificate(cert.Certificate[0]) if err != nil { - logger.Warnf(context.Background(), "parse disk cert %s: %v (keeping cert)", certPath, err) + logger.Warnf(ctx, "parse disk cert %s: %v (keeping cert)", certPath, err) return false } if time.Now().After(parsed.NotAfter) { - logger.Warnf(context.Background(), "disk cert %s expired at %s, falling back to self-signed", certPath, parsed.NotAfter.Format(time.RFC3339)) + logger.Warnf(ctx, "disk cert %s expired at %s, falling back to self-signed", certPath, parsed.NotAfter.Format(time.RFC3339)) return true } return false } -// GenerateSelfSignedCert creates an in-memory ECDSA P-256 self-signed cert for hostname and ip. +// GenerateSelfSignedCert creates an in-memory ECDSA P-256 self-signed +// cert for hostname and ip. func GenerateSelfSignedCert(hostname, ip string) (tls.Certificate, error) { priv, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) if err != nil { return tls.Certificate{}, err } + now := time.Now() template := x509.Certificate{ SerialNumber: big.NewInt(1), Subject: pkix.Name{CommonName: hostname}, - NotBefore: time.Now(), - NotAfter: time.Now().Add(10 * 365 * 24 * time.Hour), + NotBefore: now, + NotAfter: now.Add(10 * 365 * 24 * time.Hour), KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment, ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, DNSNames: []string{hostname, "localhost"}, diff --git a/k8s/tls_test.go b/k8s/tls_test.go index 3cd77cb..b66cf96 100644 --- a/k8s/tls_test.go +++ b/k8s/tls_test.go @@ -42,7 +42,7 @@ func TestGenerateSelfSignedCertIsParseable(t *testing.T) { } func TestLoadOrGenerateCertFallsBackToSelfSigned(t *testing.T) { - cert, source, err := LoadOrGenerateCert("/does/not/exist.crt", "/does/not/exist.key", "host", "10.0.0.1") + cert, source, err := LoadOrGenerateCert(t.Context(), "/does/not/exist.crt", "/does/not/exist.key", "host", "10.0.0.1") if err != nil { t.Fatalf("fallback: %v", err) } @@ -91,7 +91,7 @@ func TestLoadOrGenerateCertLoadsFromDisk(t *testing.T) { t.Fatalf("write key: %v", err) } - _, source, err := LoadOrGenerateCert(certPath, keyPath, "host", "10.0.0.1") + _, source, err := LoadOrGenerateCert(t.Context(), certPath, keyPath, "host", "10.0.0.1") if err != nil { t.Fatalf("load: %v", err) } @@ -134,7 +134,7 @@ func TestLoadOrGenerateCertExpiredDiskCertFallsBack(t *testing.T) { t.Fatalf("write key: %v", err) } - _, source, err := LoadOrGenerateCert(certPath, keyPath, "host", "10.0.0.1") + _, source, err := LoadOrGenerateCert(t.Context(), certPath, keyPath, "host", "10.0.0.1") if err != nil { t.Fatalf("load: %v", err) } diff --git a/meta/connection.go b/meta/connection.go index e0e42ae..0ef620a 100644 --- a/meta/connection.go +++ b/meta/connection.go @@ -5,8 +5,7 @@ import ( ) // ConnectionType returns the connection protocol. A non-empty override -// (typically AnnotationConnType) wins over OS-based inference, so a Linux -// image running xrdp can advertise rdp without faking its OS field. +// wins over OS-based inference (e.g. Linux + xrdp → rdp). func ConnectionType(osType string, hasVNCPort bool, override string) string { if override != "" { return override @@ -14,9 +13,9 @@ func ConnectionType(osType string, hasVNCPort bool, override string) string { switch { case hasVNCPort: return string(cocoonv1.ConnTypeVNC) - case osType == "android": + case osType == string(cocoonv1.OSAndroid): return string(cocoonv1.ConnTypeADB) - case osType == "windows": + case osType == string(cocoonv1.OSWindows): return string(cocoonv1.ConnTypeRDP) default: return string(cocoonv1.ConnTypeSSH) diff --git a/meta/lifecycle.go b/meta/lifecycle.go index 08fe85e..93db7e1 100644 --- a/meta/lifecycle.go +++ b/meta/lifecycle.go @@ -14,16 +14,14 @@ const ( LifecycleStateFailed LifecycleState = "failed" ) -// terminalStates is the lookup set IsTerminal consults — keep it in sync -// with the const block above. var terminalStates = map[LifecycleState]struct{}{ LifecycleStateReady: {}, LifecycleStateHibernated: {}, LifecycleStateFailed: {}, } -// LifecycleState is the typed contract for the lifecycle-state annotation -// vk-cocoon publishes on a Pod. +// LifecycleState is the typed contract for the lifecycle-state +// annotation vk-cocoon publishes on a Pod. type LifecycleState string // IsTerminal reports whether s is a state a client would wait for. @@ -33,19 +31,14 @@ func (s LifecycleState) IsTerminal() bool { } // LifecycleStatus is the full triple (state, observed-generation, message). -// Annotations is the source of truth for what gets written; Apply -// consumes the same map in-memory and Snapshot derives a comparison -// key from the same fields. type LifecycleStatus struct { State LifecycleState ObservedGeneration int64 Message string } -// Annotations returns the lifecycle annotation map for s. nil entries -// signal "delete this key" — pass to k8s.AnnotationsMergePatch to wrap -// into a `metadata.annotations` merge-patch body, or iterate directly -// to mutate an in-memory pod (see Apply). +// Annotations returns the lifecycle annotation map for a merge patch. +// Nil values signal "delete this key". func (s LifecycleStatus) Annotations() map[string]any { annos := map[string]any{ AnnotationLifecycleState: string(s.State), @@ -59,20 +52,20 @@ func (s LifecycleStatus) Annotations() map[string]any { return annos } -// Apply writes Annotations into the pod's annotations, deleting keys -// whose value is nil. Empty message clears the annotation so a stale -// failure reason cannot tail into the next lifecycle. +// Apply writes the status onto pod annotations. Empty message clears +// the annotation so a stale failure reason cannot tail into the next +// lifecycle. func (s LifecycleStatus) Apply(pod *corev1.Pod) { if pod == nil { return } a := ensurePodAnnotations(pod) - for key, val := range s.Annotations() { - if val == nil { - delete(a, key) - continue - } - a[key] = val.(string) + a[AnnotationLifecycleState] = string(s.State) + a[AnnotationLifecycleObservedGeneration] = strconv.FormatInt(s.ObservedGeneration, 10) + if s.Message == "" { + delete(a, AnnotationLifecycleStateMessage) + } else { + a[AnnotationLifecycleStateMessage] = s.Message } } @@ -102,15 +95,14 @@ func ReadLifecycleState(pod *corev1.Pod) LifecycleState { return LifecycleState(pod.Annotations[AnnotationLifecycleState]) } -// ReadLifecycleObservedGeneration reads the observed-generation annotation. -// Missing or unparseable returns 0 — callers treat it as "not observed yet". +// ReadLifecycleObservedGeneration reads the observed-generation +// annotation; missing or unparseable returns 0. func ReadLifecycleObservedGeneration(pod *corev1.Pod) int64 { return readInt64Annotation(pod, AnnotationLifecycleObservedGeneration) } -// ReadCocoonSetGeneration reads the CocoonSet generation stamped by -// cocoon-operator. vk-cocoon writes it back as observed-generation — -// counter-based completion is not subject to wallclock skew. +// ReadCocoonSetGeneration reads the CocoonSet generation stamped on the +// pod by cocoon-operator. func ReadCocoonSetGeneration(pod *corev1.Pod) int64 { return readInt64Annotation(pod, AnnotationCocoonSetGeneration) } @@ -124,8 +116,6 @@ func StampCocoonSetGeneration(pod *corev1.Pod, generation int64) { a[AnnotationCocoonSetGeneration] = strconv.FormatInt(generation, 10) } -// readInt64Annotation parses an int64-valued annotation, returning 0 -// when missing or unparseable. func readInt64Annotation(pod *corev1.Pod, key string) int64 { if pod == nil { return 0 From 27035ab6b53edc5915b77013c5c2d4d459ee06f3 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 17:04:10 +0800 Subject: [PATCH 17/20] fix(meta): InferRoleFromAgentSlot returns RoleToolbox for slot<0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this defensive branch, the suggested migration chain InferRoleFromAgentSlot(ExtractAgentSlot(ns, cs, vmName)) misclassified toolbox VM names as RoleSubAgent — ExtractAgentSlot returns -1 for non-agent names but the legacy slot != 0 fall-through still hit RoleSubAgent. Also drop the unused MainAgentVMName helper. --- meta/meta_test.go | 11 +++-------- meta/vmname.go | 26 +++++++++----------------- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/meta/meta_test.go b/meta/meta_test.go index 0a30035..87c5471 100644 --- a/meta/meta_test.go +++ b/meta/meta_test.go @@ -20,14 +20,6 @@ func TestVMNamingHelpers(t *testing.T) { if got := ExtractSlotFromVMName("vk-prod-toolbox"); got != -1 { t.Fatalf("expected non-slot vm name to return -1, got %d", got) } - if got := MainAgentVMName("vk-prod-demo-2"); got != "vk-prod-demo-0" { - t.Fatalf("main agent name mismatch: got %q", got) - } - // A pod-style name (no slot suffix) must be returned unchanged — - // the trailing dash inside the name is not a slot separator. - if got := MainAgentVMName("vk-prod-toolbox"); got != "vk-prod-toolbox" { - t.Fatalf("MainAgentVMName must not coerce non-slot names, got %q", got) - } } func TestInferRoleFromVMName(t *testing.T) { @@ -123,6 +115,9 @@ func TestInferRoleFromAgentSlot(t *testing.T) { if got := InferRoleFromAgentSlot(7); got != RoleSubAgent { t.Errorf("slot 7 = %q, want %q", got, RoleSubAgent) } + if got := InferRoleFromAgentSlot(-1); got != RoleToolbox { + t.Errorf("slot -1 = %q, want %q", got, RoleToolbox) + } } func TestConnectionType(t *testing.T) { diff --git a/meta/vmname.go b/meta/vmname.go index cbdbbbd..818f90b 100644 --- a/meta/vmname.go +++ b/meta/vmname.go @@ -43,13 +43,18 @@ func MainAgentVMNameFor(namespace, cocoonSet string) string { return VMNameForDeployment(namespace, cocoonSet, 0) } -// InferRoleFromAgentSlot returns RoleMain for slot 0, RoleSubAgent -// otherwise. Toolboxes use RoleToolbox; do not route them through here. +// InferRoleFromAgentSlot returns RoleMain for slot 0, RoleSubAgent for +// positive slots, RoleToolbox for slot < 0 (so the chain with +// ExtractAgentSlot stays safe for toolbox VM names). func InferRoleFromAgentSlot(slot int) string { - if slot == 0 { + switch { + case slot < 0: + return RoleToolbox + case slot == 0: return RoleMain + default: + return RoleSubAgent } - return RoleSubAgent } // ExtractSlotFromVMName parses the trailing slot index from a VM name, @@ -69,19 +74,6 @@ func ExtractSlotFromVMName(vmName string) int { return n } -// MainAgentVMName replaces the slot suffix with 0. Non-slot names are -// returned unchanged. -// -// Deprecated: shares the toolbox-collision bug of ExtractSlotFromVMName. -// Prefer MainAgentVMNameFor(namespace, cocoonSet) for new code. -func MainAgentVMName(vmName string) string { - if ExtractSlotFromVMName(vmName) < 0 { - return vmName - } - before, _, _ := lastCut(vmName, "-") - return before + "-0" -} - // InferRoleFromVMName returns RoleMain for slot 0, RoleSubAgent otherwise. // // Deprecated: shares the toolbox-collision bug of ExtractSlotFromVMName. From 4714e5c2d436244b5762356d0b7059c370420088 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 17:18:27 +0800 Subject: [PATCH 18/20] fix(auth): reject session at Exp == now (JWT-style) --- auth/session.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auth/session.go b/auth/session.go index 1609044..0fe4dc0 100644 --- a/auth/session.go +++ b/auth/session.go @@ -53,7 +53,7 @@ func VerifySession(cookie string, key []byte) (*Session, bool) { if json.Unmarshal(data, sess) != nil { return nil, false } - if sess.Exp != 0 && sess.Exp < time.Now().Unix() { + if sess.Exp != 0 && sess.Exp <= time.Now().Unix() { return nil, false } return sess, true From e451b5e185bed10d9d9c2245de9cbcd82fadb1d9 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 17:18:27 +0800 Subject: [PATCH 19/20] fix(k8s): surface partial TLS keypair misconfig as error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, tryLoadDiskCert silently fell back to self-signed for any fs.ErrNotExist from LoadX509KeyPair — including the asymmetric case where the cert exists but the key is missing (or vice versa). Stat the cert path explicitly so only "cert path missing" triggers the fallback; other errors (e.g. key missing while cert exists, permission denied) propagate so operators can detect the misconfiguration. --- k8s/tls.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/k8s/tls.go b/k8s/tls.go index ef16bbe..b43a3af 100644 --- a/k8s/tls.go +++ b/k8s/tls.go @@ -14,6 +14,7 @@ import ( "io/fs" "math/big" "net" + "os" "time" "github.com/projecteru2/core/log" @@ -41,16 +42,21 @@ func LoadOrGenerateCert(ctx context.Context, certPath, keyPath, hostname, ip str // tryLoadDiskCert returns ("", "", nil) when the caller should fall // back to self-signed (paths empty, cert missing, or expired) and an -// error only when a configured keypair fails to load. +// error only when a configured keypair fails to load. A missing key +// while the cert is present is a misconfiguration and surfaces as an +// error rather than silently switching to self-signed. func tryLoadDiskCert(ctx context.Context, certPath, keyPath string) (tls.Certificate, string, error) { if certPath == "" || keyPath == "" { return tls.Certificate{}, "", nil } - cert, err := tls.LoadX509KeyPair(certPath, keyPath) - if err != nil { + if _, err := os.Stat(certPath); err != nil { if errors.Is(err, fs.ErrNotExist) { return tls.Certificate{}, "", nil } + return tls.Certificate{}, "", fmt.Errorf("stat tls cert %s: %w", certPath, err) + } + cert, err := tls.LoadX509KeyPair(certPath, keyPath) + if err != nil { return tls.Certificate{}, "", fmt.Errorf("load tls keypair %s: %w", certPath, err) } if isCertExpired(ctx, cert, certPath) { From d15dd3c77f52f34ce2244211006f19a9e127aa45 Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 13 May 2026 17:19:37 +0800 Subject: [PATCH 20/20] style: trim verbose comments added in prior fixes --- k8s/tls.go | 4 +--- meta/vmname.go | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/k8s/tls.go b/k8s/tls.go index b43a3af..86e51fa 100644 --- a/k8s/tls.go +++ b/k8s/tls.go @@ -42,9 +42,7 @@ func LoadOrGenerateCert(ctx context.Context, certPath, keyPath, hostname, ip str // tryLoadDiskCert returns ("", "", nil) when the caller should fall // back to self-signed (paths empty, cert missing, or expired) and an -// error only when a configured keypair fails to load. A missing key -// while the cert is present is a misconfiguration and surfaces as an -// error rather than silently switching to self-signed. +// error only when a configured keypair fails to load. func tryLoadDiskCert(ctx context.Context, certPath, keyPath string) (tls.Certificate, string, error) { if certPath == "" || keyPath == "" { return tls.Certificate{}, "", nil diff --git a/meta/vmname.go b/meta/vmname.go index 818f90b..0ca58ad 100644 --- a/meta/vmname.go +++ b/meta/vmname.go @@ -44,8 +44,7 @@ func MainAgentVMNameFor(namespace, cocoonSet string) string { } // InferRoleFromAgentSlot returns RoleMain for slot 0, RoleSubAgent for -// positive slots, RoleToolbox for slot < 0 (so the chain with -// ExtractAgentSlot stays safe for toolbox VM names). +// positive slots, RoleToolbox for slot < 0. func InferRoleFromAgentSlot(slot int) string { switch { case slot < 0: