diff --git a/.gitignore b/.gitignore index 93e85512..d0202079 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,6 @@ tmp/* .claude/* cocoon-test + +# Local scratch / test scripts (not for upstream) +/scratch/ diff --git a/KNOWN_ISSUES.md b/KNOWN_ISSUES.md index dc5014e7..b7a8cfc9 100644 --- a/KNOWN_ISSUES.md +++ b/KNOWN_ISSUES.md @@ -242,6 +242,23 @@ Mitigations for production users: Control-plane traffic from cocoon-managed hosts (vk-cocoon, `cocoon vm exec`) goes through cocoon-agent over vsock and never depends on SSH credentials. +## NIC hot-remove leaves the PCI slot pending on Cloud Hypervisor + +`cocoon vm net --nics N` tears down host TAP/veth/CNI plumbing immediately after `vm.remove-device` returns. CH only frees the PCI slot (and unregisters the device's ioeventfds) when the guest writes to the ACPI hot-plug controller (B0EJ) in response to the SCI raised by `remove-device`. If the guest is stopped, paused, or its NIC driver is wedged, the slot stays `Allocated` until the guest cooperates. + +Cocoon does not wait for B0EJ — there is no reliable signal from the CH HTTP API. The user accepts: + +- The guest may continue to reference a NIC whose host plumbing is gone, hanging the in-guest driver. +- The pending eject may surface later as `Cannot register ioevent: File exists` when CH next tries to reuse that slot (e.g. a subsequent hot-add). + +Quiesce the guest NIC (ip link set down + NetworkManager remove on Linux; Disable-PnpDevice + driver unbind on Windows) before reducing the count. + +Firecracker is not supported: FC has no NIC hot-plug / hot-unplug API. + +## NIC hot-remove during clone hot-swap cannot wait for guest B0EJ + +Plain `cocoon vm net --nics N` polls CH's `device_tree` after `vm.remove-device` until the guest ACKs B0EJ via ACPI/SCI (typically < 1 s on Linux, a few seconds on Windows). The clone path's `hotSwapNets` runs while the VM is paused, so the guest cannot process SCI — eject stays pending until resume. The clone path therefore returns without waiting and adds the fresh NICs against half-removed device-tree state. This is the long-standing CH limitation cocoon was designed around; the new wait only applies to the running-VM resize. + ## Android cocoon-agent service may be blocked by SELinux `os-image/android/{14.0,15.0}` install the cocoon-agent binary at `/system/bin/cocoon-agent` and register it via `/system/etc/init/cocoon-agent.rc`. Android's SELinux policies don't ship with a domain for cocoon-agent, so the service may run in `init`'s domain or be denied outright depending on the redroid build. diff --git a/README.md b/README.md index 9d9ff369..aa2e68ba 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,7 @@ cocoon │ ├── device │ │ ├── attach [flags] VM Attach a VFIO PCI device (CH only) │ │ └── detach [flags] VM Detach a VFIO PCI device by --id +│ ├── net [flags] VM Resize NIC count on a running VM (CH only) │ └── debug [flags] IMAGE Generate hypervisor launch command (dry run) ├── snapshot │ ├── save [flags] VM Create a snapshot from a running VM @@ -520,6 +521,22 @@ cocoon vm device detach my-vm --id mygpu `cocoon vm inspect VM` includes an `attached_devices` field for running VMs that surfaces every attached vhost-user-fs share and VFIO device, read live from CH `vm.info`. The field is omitted for stopped VMs. +## NIC Hot-Resize (Cloud Hypervisor only) + +`cocoon vm net --nics N VM` brings the running VM's NIC count to `N`. To add NICs, cocoon allocates new host TAP/CNI/bridge plumbing and hot-plugs a fresh NIC into the guest. To remove NICs, it pops from the tail (LIFO) via `vm.remove-device` and tears down the host plumbing. + +```bash +# Add a second NIC (or any number). +cocoon vm net my-vm --nics 2 + +# Remove a NIC (LIFO from the tail). +cocoon vm net my-vm --nics 1 +``` + +Cocoon manages **host-side** plumbing only. CH's `vm.remove-device` marks the slot for ejection but the actual eject only happens when the guest cooperates via ACPI (B0EJ write). The host TAP / veth / CNI lease are torn down immediately after the API call regardless. Quiesce in-guest NIC state (driver unbind, NetworkManager removal, Windows NDIS halt) **before** reducing the count, or the in-guest driver will reference plumbing that no longer exists. + +A VM started with zero NICs cannot be resized up (the VM record carries no provider hint). Start with at least one NIC if you plan to resize. + ## Windows Support Cocoon supports Windows guests via the `--windows` flag: diff --git a/cmd/vm/attach.go b/cmd/vm/attach.go index 8cc1d03d..d96368a2 100644 --- a/cmd/vm/attach.go +++ b/cmd/vm/attach.go @@ -9,13 +9,14 @@ import ( "github.com/spf13/cobra" cmdcore "github.com/cocoonstack/cocoon/cmd/core" + "github.com/cocoonstack/cocoon/config" "github.com/cocoonstack/cocoon/extend/fs" "github.com/cocoonstack/cocoon/extend/vfio" "github.com/cocoonstack/cocoon/hypervisor" ) func (h Handler) FsAttach(cmd *cobra.Command, args []string) error { - ctx, a, err := resolveAttacher[fs.Attacher](h, cmd, args, "fs attach", fs.ErrUnsupportedBackend) + ctx, _, _, a, err := resolveAttacher[fs.Attacher](h, cmd, args, "fs attach", fs.ErrUnsupportedBackend) if err != nil { return err } @@ -35,7 +36,7 @@ func (h Handler) FsAttach(cmd *cobra.Command, args []string) error { } func (h Handler) FsDetach(cmd *cobra.Command, args []string) error { - ctx, a, err := resolveAttacher[fs.Attacher](h, cmd, args, "fs detach", fs.ErrUnsupportedBackend) + ctx, _, _, a, err := resolveAttacher[fs.Attacher](h, cmd, args, "fs detach", fs.ErrUnsupportedBackend) if err != nil { return err } @@ -51,7 +52,7 @@ func (h Handler) FsDetach(cmd *cobra.Command, args []string) error { } func (h Handler) DeviceAttach(cmd *cobra.Command, args []string) error { - ctx, a, err := resolveAttacher[vfio.Attacher](h, cmd, args, "device attach", vfio.ErrUnsupportedBackend) + ctx, _, _, a, err := resolveAttacher[vfio.Attacher](h, cmd, args, "device attach", vfio.ErrUnsupportedBackend) if err != nil { return err } @@ -69,7 +70,7 @@ func (h Handler) DeviceAttach(cmd *cobra.Command, args []string) error { } func (h Handler) DeviceDetach(cmd *cobra.Command, args []string) error { - ctx, a, err := resolveAttacher[vfio.Attacher](h, cmd, args, "device detach", vfio.ErrUnsupportedBackend) + ctx, _, _, a, err := resolveAttacher[vfio.Attacher](h, cmd, args, "device detach", vfio.ErrUnsupportedBackend) if err != nil { return err } @@ -84,24 +85,22 @@ func (h Handler) DeviceDetach(cmd *cobra.Command, args []string) error { return nil } -// resolveAttacher resolves args[0] to a hypervisor implementing A -// (fs.Attacher / vfio.Attacher). op ("fs attach", "device detach") prefixes -// both error wraps so callers see the operation, not just the type-assert. -func resolveAttacher[A any](h Handler, cmd *cobra.Command, args []string, op string, errUnsupported error) (context.Context, A, error) { +// resolveAttacher resolves args[0] to a hypervisor implementing A; also returns conf+hyper for further ops. +func resolveAttacher[A any](h Handler, cmd *cobra.Command, args []string, op string, errUnsupported error) (context.Context, *config.Config, hypervisor.Hypervisor, A, error) { var zero A ctx, conf, err := h.Init(cmd) if err != nil { - return ctx, zero, err + return ctx, nil, nil, zero, err } hyper, err := cmdcore.FindHypervisor(ctx, conf, args[0]) if err != nil { - return ctx, zero, fmt.Errorf("%s: %w", op, err) + return ctx, conf, nil, zero, fmt.Errorf("%s: %w", op, err) } a, ok := hyper.(A) if !ok { - return ctx, zero, fmt.Errorf("%s: backend %s: %w", op, hyper.Type(), errUnsupported) + return ctx, conf, hyper, zero, fmt.Errorf("%s: backend %s: %w", op, hyper.Type(), errUnsupported) } - return ctx, a, nil + return ctx, conf, hyper, a, nil } // classifyAttachErr surfaces ErrNotRunning more clearly than the generic wrap. diff --git a/cmd/vm/commands.go b/cmd/vm/commands.go index b5809f1a..036b0dac 100644 --- a/cmd/vm/commands.go +++ b/cmd/vm/commands.go @@ -28,6 +28,7 @@ type Actions interface { FsDetach(cmd *cobra.Command, args []string) error DeviceAttach(cmd *cobra.Command, args []string) error DeviceDetach(cmd *cobra.Command, args []string) error + NetResize(cmd *cobra.Command, args []string) error } // Command builds the "vm" parent command with all subcommands. @@ -188,10 +189,24 @@ func Command(h Actions) *cobra.Command { statusCmd, buildFsCommand(h), buildDeviceCommand(h), + buildNetCommand(h), ) return vmCmd } +func buildNetCommand(h Actions) *cobra.Command { + cmd := &cobra.Command{ + Use: "net VM", + Short: "Resize a running VM's NIC count (CH only); quiesce in-guest NIC state before reducing", + Args: cobra.ExactArgs(1), + RunE: h.NetResize, + } + cmd.Flags().Int("nics", -1, "target NIC count (required, >= 0)") + _ = cmd.MarkFlagRequired("nics") + cmdcore.AddOutputFlag(cmd) + return cmd +} + func buildFsCommand(h Actions) *cobra.Command { parent := &cobra.Command{ Use: "fs", diff --git a/cmd/vm/lifecycle.go b/cmd/vm/lifecycle.go index f1a52bcb..9e84aca3 100644 --- a/cmd/vm/lifecycle.go +++ b/cmd/vm/lifecycle.go @@ -386,20 +386,20 @@ func (h Handler) recoverNetwork(ctx context.Context, conf *config.Config, hyper continue } logger.Warnf(ctx, "network missing for VM %s, recovering", vm.ID) - if _, recoverErr := netProvider.Config(ctx, vm.ID, len(vm.NetworkConfigs), &vm.Config, vm.NetworkConfigs...); recoverErr != nil { + if _, recoverErr := netProvider.Add(ctx, vm.ID, &vm.Config, network.AddRecover(vm.NetworkConfigs)...); recoverErr != nil { logger.Warnf(ctx, "recover network for VM %s: %v (start will fail)", vm.ID, recoverErr) } } } -// providerForVM selects network provider from persisted NetworkConfig. +// providerForVM picks the network provider from persisted NetworkConfig; cniProvider may be nil (lazy-init), bridgeCache must be non-nil. func providerForVM(conf *config.Config, cniProvider network.Network, bridgeCache map[string]network.Network, configs []*types.NetworkConfig) (network.Network, error) { if len(configs) == 0 { return nil, fmt.Errorf("no network configs") } // All NICs on a VM share the same backend. cfg := configs[0] - if cfg.Backend == "bridge" { + if cfg.Backend == types.BackendBridge { if cfg.BridgeDev == "" { return nil, fmt.Errorf("bridge backend but no bridge device persisted") } @@ -414,10 +414,10 @@ func providerForVM(conf *config.Config, cniProvider network.Network, bridgeCache return p, nil } // "cni" or empty (backward compat). - if cniProvider == nil { - return nil, fmt.Errorf("cni provider not available") + if cniProvider != nil { + return cniProvider, nil } - return cniProvider, nil + return cmdcore.InitNetwork(conf) } func batchRoutedCmd(ctx context.Context, cmd *cobra.Command, name, pastTense string, routed map[hypervisor.Hypervisor][]string, fn func(hypervisor.Hypervisor, []string) ([]string, error)) error { diff --git a/cmd/vm/netresize.go b/cmd/vm/netresize.go new file mode 100644 index 00000000..aa1fab8a --- /dev/null +++ b/cmd/vm/netresize.go @@ -0,0 +1,52 @@ +package vm + +import ( + "fmt" + + "github.com/projecteru2/core/log" + "github.com/spf13/cobra" + + cmdcore "github.com/cocoonstack/cocoon/cmd/core" + "github.com/cocoonstack/cocoon/config" + "github.com/cocoonstack/cocoon/extend/netresize" + "github.com/cocoonstack/cocoon/network" + "github.com/cocoonstack/cocoon/types" +) + +func (h Handler) NetResize(cmd *cobra.Command, args []string) error { + ctx, conf, hyper, resizer, err := resolveAttacher[netresize.Resizer](h, cmd, args, "vm net", netresize.ErrUnsupportedBackend) + if err != nil { + return err + } + vm, err := hyper.Inspect(ctx, args[0]) + if err != nil { + return fmt.Errorf("vm net: %w", err) + } + plumbing, err := plumbingForVM(conf, vm.NetworkConfigs) + if err != nil { + return fmt.Errorf("vm net: %w", err) + } + target, _ := cmd.Flags().GetInt("nics") + res, err := resizer.NetResize(ctx, args[0], netresize.Spec{Target: target}, plumbing) + if err != nil { + return classifyAttachErr(err) + } + if done, jsonErr := cmdcore.MaybeOutputJSON(cmd, res); done { + return jsonErr + } + logger := log.WithFunc("cmd.vm.net") + logger.Infof(ctx, "resized %s: before=%d after=%d added=%d removed=%d", + args[0], res.Before, res.After, len(res.Added), len(res.Removed)) + for _, w := range res.Warnings { + logger.Warnf(ctx, "%s: %s", args[0], w) + } + return nil +} + +// plumbingForVM picks the provider for the VM's existing NICs; fails on zero NICs (VMConfig has no bridge hint). +func plumbingForVM(conf *config.Config, configs []*types.NetworkConfig) (network.Network, error) { + if len(configs) == 0 { + return nil, fmt.Errorf("vm has zero NICs; resize up is not supported (start the VM with at least one NIC)") + } + return providerForVM(conf, nil, map[string]network.Network{}, configs) +} diff --git a/cmd/vm/run.go b/cmd/vm/run.go index b9419fa9..8c4b955d 100644 --- a/cmd/vm/run.go +++ b/cmd/vm/run.go @@ -480,7 +480,7 @@ func initNetwork(ctx context.Context, conf *config.Config, vmID string, nics int // The network layer derives TAP queues from vmCfg.CPU. origCPU := vmCfg.CPU vmCfg.CPU = queues - configs, err := netProvider.Config(ctx, vmID, nics, vmCfg) + configs, err := netProvider.Add(ctx, vmID, vmCfg, network.AddRange(0, nics)...) vmCfg.CPU = origCPU if err != nil { return nil, nil, fmt.Errorf("configure network: %w", err) diff --git a/extend/netresize/netresize.go b/extend/netresize/netresize.go new file mode 100644 index 00000000..4433d0ad --- /dev/null +++ b/extend/netresize/netresize.go @@ -0,0 +1,54 @@ +// Package netresize is the runtime interface for resizing a VM's NIC count. +package netresize + +import ( + "context" + "errors" + "fmt" + + "github.com/cocoonstack/cocoon/network" + "github.com/cocoonstack/cocoon/types" +) + +// ErrUnsupportedBackend signals the resolved hypervisor cannot resize NICs. +var ErrUnsupportedBackend = errors.New("backend does not support net resize") + +// Spec is one resize request. +type Spec struct { + Target int +} + +// NIC is one NIC summary surfaced through Result.Added / Result.Removed. +type NIC struct { + Index int `json:"index"` + TAP string `json:"tap"` + MAC string `json:"mac"` +} + +// Result reports the before/after count, NICs touched, and non-fatal Warnings. +type Result struct { + Before int `json:"before"` + After int `json:"after"` + Added []NIC `json:"added,omitempty"` + Removed []NIC `json:"removed,omitempty"` + Warnings []string `json:"warnings,omitempty"` +} + +// Plumbing is the host-side network ops NetResize delegates to; network.Network satisfies it. +type Plumbing interface { + Add(ctx context.Context, vmID string, vmCfg *types.VMConfig, specs ...network.AddSpec) ([]*types.NetworkConfig, error) + Remove(ctx context.Context, vmID string, indices ...int) error +} + +// Resizer resizes a running VM's NIC count. +type Resizer interface { + NetResize(ctx context.Context, vmRef string, spec Spec, plumbing Plumbing) (Result, error) +} + +// Normalize validates the spec. +func (s *Spec) Normalize() error { + if s.Target < 0 { + return fmt.Errorf("--nics must be non-negative, got %d", s.Target) + } + return nil +} diff --git a/hypervisor/cloudhypervisor/api.go b/hypervisor/cloudhypervisor/api.go index 8a997353..a03dee2e 100644 --- a/hypervisor/cloudhypervisor/api.go +++ b/hypervisor/cloudhypervisor/api.go @@ -1,5 +1,7 @@ package cloudhypervisor +import "encoding/json" + type chVMConfig struct { // Optional — pointer + omitempty (nil → omitted from JSON). Payload *chPayload `json:"payload,omitempty"` @@ -110,7 +112,8 @@ type chPciDeviceInfo struct { } type chVMInfoResponse struct { - Config chVMInfoConfig `json:"config"` + Config chVMInfoConfig `json:"config"` + DeviceTree map[string]json.RawMessage `json:"device_tree,omitempty"` } type chVMInfoConfig struct { @@ -119,4 +122,5 @@ type chVMInfoConfig struct { Memory chMemory `json:"memory"` Fs []chFs `json:"fs,omitempty"` Devices []chDevice `json:"devices,omitempty"` + Nets []chNet `json:"net,omitempty"` } diff --git a/hypervisor/cloudhypervisor/args.go b/hypervisor/cloudhypervisor/args.go index c31de8ef..723d4b37 100644 --- a/hypervisor/cloudhypervisor/args.go +++ b/hypervisor/cloudhypervisor/args.go @@ -17,6 +17,8 @@ const ( // defaultDiskQueueSize is the virtio-blk queue depth per device. defaultDiskQueueSize = 512 cidataFile = "cidata.img" + + cocoonNetIDPrefix = "cocoon-net-" ) // kvBuilder accumulates key=value CLI fragments. @@ -179,6 +181,11 @@ func networkConfigToNet(nc *types.NetworkConfig) chNet { } } +// cocoonNetID is the deterministic CH device id for a cocoon-managed NIC. +func cocoonNetID(mac string) string { + return cocoonNetIDPrefix + strings.ReplaceAll(mac, ":", "") +} + func storageConfigToDisk(storageConfig *types.StorageConfig, cpuCount, diskQueueSize int, noDirectIO bool) chDisk { if diskQueueSize <= 0 { diskQueueSize = defaultDiskQueueSize diff --git a/hypervisor/cloudhypervisor/clone.go b/hypervisor/cloudhypervisor/clone.go index 3e04786f..51807d49 100644 --- a/hypervisor/cloudhypervisor/clone.go +++ b/hypervisor/cloudhypervisor/clone.go @@ -320,8 +320,7 @@ func hotSwapNets(ctx context.Context, hc *http.Client, oldNets []chNet, networkC logger.Infof(ctx, "removed snapshot NIC %s (old MAC %s)", oldNet.ID, oldNet.MAC) } for i, nc := range networkConfigs { - newNet := networkConfigToNet(nc) - if err := addNetVM(ctx, hc, newNet); err != nil { + if _, err := addCocoonNIC(ctx, hc, nc); err != nil { return fmt.Errorf("add net device %d/%d (MAC %s, TAP %s): %w", i+1, len(networkConfigs), nc.MAC, nc.TAP, err) } diff --git a/hypervisor/cloudhypervisor/extend.go b/hypervisor/cloudhypervisor/extend.go index f1e37ff4..229825b0 100644 --- a/hypervisor/cloudhypervisor/extend.go +++ b/hypervisor/cloudhypervisor/extend.go @@ -10,6 +10,7 @@ import ( "strings" "github.com/cocoonstack/cocoon/extend/fs" + "github.com/cocoonstack/cocoon/extend/netresize" "github.com/cocoonstack/cocoon/extend/vfio" "github.com/cocoonstack/cocoon/hypervisor" "github.com/cocoonstack/cocoon/types" @@ -17,10 +18,11 @@ import ( ) var ( - _ fs.Attacher = (*CloudHypervisor)(nil) - _ fs.Lister = (*CloudHypervisor)(nil) - _ vfio.Attacher = (*CloudHypervisor)(nil) - _ vfio.Lister = (*CloudHypervisor)(nil) + _ fs.Attacher = (*CloudHypervisor)(nil) + _ fs.Lister = (*CloudHypervisor)(nil) + _ vfio.Attacher = (*CloudHypervisor)(nil) + _ vfio.Lister = (*CloudHypervisor)(nil) + _ netresize.Resizer = (*CloudHypervisor)(nil) ) // FsAttach hot-plugs a vhost-user-fs device onto a running CH VM. @@ -214,26 +216,32 @@ func (ch *CloudHypervisor) detachWith( // alive (PID file + cmdline match — same gate as Backend.WithRunningVM), // and returns an http.Client connected to its CH API socket. func (ch *CloudHypervisor) runningVMClient(ctx context.Context, vmRef string) (*http.Client, error) { + hc, _, _, err := ch.runningVMClientWithRecord(ctx, vmRef) + return hc, err +} + +// runningVMClientWithRecord is runningVMClient + the resolved vmID and record. +func (ch *CloudHypervisor) runningVMClientWithRecord(ctx context.Context, vmRef string) (*http.Client, string, hypervisor.VMRecord, error) { vmID, err := ch.ResolveRef(ctx, vmRef) if err != nil { - return nil, err + return nil, "", hypervisor.VMRecord{}, err } rec, err := ch.LoadRecord(ctx, vmID) if err != nil { - return nil, err + return nil, "", hypervisor.VMRecord{}, err } if rec.State != types.VMStateRunning { - return nil, fmt.Errorf("vm %s is %s: %w", vmID, rec.State, hypervisor.ErrNotRunning) + return nil, "", hypervisor.VMRecord{}, fmt.Errorf("vm %s is %s: %w", vmID, rec.State, hypervisor.ErrNotRunning) } sockPath := hypervisor.SocketPath(rec.RunDir) pid, pidErr := utils.ReadPIDFile(ch.PIDFilePath(rec.RunDir)) if pidErr != nil { - return nil, fmt.Errorf("vm %s read pidfile: %w: %w", vmID, pidErr, hypervisor.ErrNotRunning) + return nil, "", hypervisor.VMRecord{}, fmt.Errorf("vm %s read pidfile: %w: %w", vmID, pidErr, hypervisor.ErrNotRunning) } if !utils.VerifyProcessCmdline(pid, ch.conf.BinaryName(), sockPath) { - return nil, fmt.Errorf("vm %s pid %d not %s: %w", vmID, pid, ch.conf.BinaryName(), hypervisor.ErrNotRunning) + return nil, "", hypervisor.VMRecord{}, fmt.Errorf("vm %s pid %d not %s: %w", vmID, pid, ch.conf.BinaryName(), hypervisor.ErrNotRunning) } - return utils.NewSocketHTTPClient(sockPath), nil + return utils.NewSocketHTTPClient(sockPath), vmID, rec, nil } // listWith is the shared skeleton for inspect-time enumeration. diff --git a/hypervisor/cloudhypervisor/helper.go b/hypervisor/cloudhypervisor/helper.go index 59f9643a..f0cee3c8 100644 --- a/hypervisor/cloudhypervisor/helper.go +++ b/hypervisor/cloudhypervisor/helper.go @@ -8,6 +8,7 @@ import ( "os" "path/filepath" "strings" + "time" "github.com/projecteru2/core/log" @@ -180,10 +181,35 @@ func removeDeviceVM(ctx context.Context, hc *http.Client, deviceID string) error return vmPutJSON(ctx, hc, "vm.remove-device", "remove-device request", map[string]string{"id": deviceID}) } +// waitDeviceEjected blocks until id is gone from CH's device_tree. +func waitDeviceEjected(ctx context.Context, hc *http.Client, deviceID string, timeout time.Duration) error { + return utils.WaitFor(ctx, timeout, 100*time.Millisecond, func() (bool, error) { + info, err := getVMInfo(ctx, hc) + if err != nil { + return false, err + } + _, present := info.DeviceTree[deviceID] + return !present, nil + }) +} + func addNetVM(ctx context.Context, hc *http.Client, net chNet) error { return vmPutJSON(ctx, hc, "vm.add-net", "add-net request", net, http.StatusOK, http.StatusNoContent) } +// addCocoonNIC posts vm.add-net with the deterministic cocoon-net- id; returns id for rollback. +func addCocoonNIC(ctx context.Context, hc *http.Client, nc *types.NetworkConfig) (string, error) { + if nc == nil { + return "", fmt.Errorf("addCocoonNIC: nil network config") + } + chN := networkConfigToNet(nc) + chN.ID = cocoonNetID(nc.MAC) + if err := addNetVM(ctx, hc, chN); err != nil { + return "", err + } + return chN.ID, nil +} + // getVMInfo fetches vm.info; cocoon uses it to detect tag/id conflicts // before hot-add and to surface attached devices through inspect. func getVMInfo(ctx context.Context, hc *http.Client) (*chVMInfoResponse, error) { diff --git a/hypervisor/cloudhypervisor/netresize.go b/hypervisor/cloudhypervisor/netresize.go new file mode 100644 index 00000000..09649d8c --- /dev/null +++ b/hypervisor/cloudhypervisor/netresize.go @@ -0,0 +1,143 @@ +package cloudhypervisor + +import ( + "context" + "fmt" + "net/http" + "strings" + "time" + + "github.com/projecteru2/core/log" + + "github.com/cocoonstack/cocoon/extend/netresize" + "github.com/cocoonstack/cocoon/hypervisor" + "github.com/cocoonstack/cocoon/network" + "github.com/cocoonstack/cocoon/types" +) + +// ejectWaitTimeout bounds the wait for guest B0EJ; Linux acks < 1 s, Windows can take a few. +const ejectWaitTimeout = 10 * time.Second + +// NetResize brings the VM's NIC count to spec.Target on a running CH VM. +func (ch *CloudHypervisor) NetResize(ctx context.Context, vmRef string, spec netresize.Spec, plumbing netresize.Plumbing) (netresize.Result, error) { + if err := spec.Normalize(); err != nil { + return netresize.Result{}, err + } + hc, vmID, rec, err := ch.runningVMClientWithRecord(ctx, vmRef) + if err != nil { + return netresize.Result{}, err + } + current := len(rec.NetworkConfigs) + res := netresize.Result{Before: current, After: current} + switch { + case spec.Target == current: + return res, nil + case spec.Target > current: + return ch.netResizeAdd(ctx, hc, vmID, &rec.Config, plumbing, current, spec.Target, res) + default: + info, infoErr := getVMInfo(ctx, hc) + if infoErr != nil { + return res, infoErr + } + return ch.netResizeRemove(ctx, hc, info, vmID, rec.NetworkConfigs, plumbing, current, spec.Target, res) + } +} + +func (ch *CloudHypervisor) netResizeAdd(ctx context.Context, hc *http.Client, vmID string, vmCfg *types.VMConfig, plumbing netresize.Plumbing, from, target int, res netresize.Result) (netresize.Result, error) { + logger := log.WithFunc("cloudhypervisor.NetResize.add") + res.Added = make([]netresize.NIC, 0, target-from) + for i := from; i < target; i++ { + ncs, err := plumbing.Add(ctx, vmID, vmCfg, network.AddSpec{Index: i}) + if err != nil { + return res, fmt.Errorf("nic %d host plumbing: %w", i, err) + } + if len(ncs) != 1 || ncs[0] == nil { + return res, fmt.Errorf("nic %d: plumbing returned %d configs", i, len(ncs)) + } + nc := ncs[0] + chID, err := addCocoonNIC(ctx, hc, nc) + if err != nil { + if rmErr := plumbing.Remove(ctx, vmID, i); rmErr != nil { + logger.Warnf(ctx, "rollback host plumbing for nic %d: %v", i, rmErr) + } + return res, fmt.Errorf("vm.add-net nic %d: %w", i, err) + } + if err := ch.appendNetworkConfig(ctx, vmID, nc); err != nil { + // Wait for B0EJ before host teardown (symmetric with netResizeRemove). + if rmErr := removeDeviceVM(ctx, hc, chID); rmErr != nil { + logger.Warnf(ctx, "rollback vm.remove-device %s after persist failure: %v", chID, rmErr) + } else if wErr := waitDeviceEjected(ctx, hc, chID, ejectWaitTimeout); wErr != nil { + logger.Warnf(ctx, "rollback wait eject %s after persist failure: %v", chID, wErr) + } + if rmErr := plumbing.Remove(ctx, vmID, i); rmErr != nil { + logger.Warnf(ctx, "rollback host plumbing for nic %d: %v", i, rmErr) + } + return res, fmt.Errorf("persist nic %d: %w", i, err) + } + res.Added = append(res.Added, netresize.NIC{Index: i, TAP: nc.TAP, MAC: nc.MAC}) + res.After = i + 1 + } + return res, nil +} + +func (ch *CloudHypervisor) netResizeRemove(ctx context.Context, hc *http.Client, info *chVMInfoResponse, vmID string, ncs []*types.NetworkConfig, plumbing netresize.Plumbing, current, target int, res netresize.Result) (netresize.Result, error) { + logger := log.WithFunc("cloudhypervisor.NetResize.remove") + res.Removed = make([]netresize.NIC, 0, current-target) + macToID := make(map[string]string, len(info.Config.Nets)) + for _, n := range info.Config.Nets { + macToID[strings.ToLower(n.MAC)] = n.ID + } + for i := current - 1; i >= target; i-- { + nc := ncs[i] + if nc == nil { + return res, fmt.Errorf("nic %d: nil network config", i) + } + chID := macToID[strings.ToLower(nc.MAC)] + if chID == "" { + return res, fmt.Errorf("nic %d MAC %s: no live device", i, nc.MAC) + } + if err := removeDeviceVM(ctx, hc, chID); err != nil { + return res, fmt.Errorf("vm.remove-device nic %d (%s): %w", i, chID, err) + } + if err := waitDeviceEjected(ctx, hc, chID, ejectWaitTimeout); err != nil { + return res, fmt.Errorf("wait eject nic %d (%s): %w", i, chID, err) + } + plumbingErr := plumbing.Remove(ctx, vmID, i) + if err := ch.truncateNetworkConfigs(ctx, vmID, i); err != nil { + logger.Errorf(ctx, err, "persistence diverged from CH for vm %s nic %d (%s): live device removed, cocoon record retained", vmID, i, chID) + return res, fmt.Errorf("persist remove nic %d: %w", i, err) + } + if plumbingErr != nil { + msg := fmt.Sprintf("nic %d (%s) host plumbing leaked, cocoon vm rm or gc will reclaim: %v", i, chID, plumbingErr) + logger.Warn(ctx, msg) + res.Warnings = append(res.Warnings, msg) + } + res.Removed = append(res.Removed, netresize.NIC{Index: i, TAP: nc.TAP, MAC: nc.MAC}) + res.After = i + } + return res, nil +} + +func (ch *CloudHypervisor) appendNetworkConfig(ctx context.Context, vmID string, nc *types.NetworkConfig) error { + return ch.DB.Update(ctx, func(idx *hypervisor.VMIndex) error { + r, err := idx.GetRecord(vmID) + if err != nil { + return err + } + r.NetworkConfigs = append(r.NetworkConfigs, nc) + return nil + }) +} + +func (ch *CloudHypervisor) truncateNetworkConfigs(ctx context.Context, vmID string, length int) error { + return ch.DB.Update(ctx, func(idx *hypervisor.VMIndex) error { + r, err := idx.GetRecord(vmID) + if err != nil { + return err + } + if length < len(r.NetworkConfigs) { + r.NetworkConfigs = r.NetworkConfigs[:length] + } + return nil + }) +} diff --git a/network/bridge/bridge_linux.go b/network/bridge/bridge_linux.go index 15ca556e..6cab195d 100644 --- a/network/bridge/bridge_linux.go +++ b/network/bridge/bridge_linux.go @@ -67,53 +67,56 @@ func (b *Bridge) Verify(_ context.Context, vmID string) error { return nil } -// Config creates TAP devices and adds them to the bridge. -func (b *Bridge) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types.VMConfig, existing ...*types.NetworkConfig) ([]*types.NetworkConfig, error) { - logger := log.WithFunc("bridge.Config") +// Add allocates TAP devices on the bridge for the given specs. +func (b *Bridge) Add(ctx context.Context, vmID string, vmCfg *types.VMConfig, specs ...network.AddSpec) (configs []*types.NetworkConfig, retErr error) { + if len(specs) == 0 { + return nil, nil + } + logger := log.WithFunc("bridge.Add") br, err := netlink.LinkByIndex(b.bridgeIdx) if err != nil { return nil, fmt.Errorf("find bridge: %w", err) } - var configs []*types.NetworkConfig - for i := range numNICs { - name := tapName(vmID, i) - - var mac string - if i < len(existing) && existing[i] != nil { - mac = existing[i].MAC - } else { - mac = generateMAC() + added := make([]int, 0, len(specs)) + defer func() { + if retErr == nil || len(added) == 0 { + return + } + _ = tearDownTAPs(vmID, added, true) + }() + + configs = make([]*types.NetworkConfig, 0, len(specs)) + for _, spec := range specs { + name := tapName(vmID, spec.Index) + mac := generateMAC() + if spec.Existing != nil { + mac = spec.Existing.MAC } - queues := network.NetNumQueues(vmCfg.CPU) - if err := createTAP(name, queues); err != nil { - return nil, fmt.Errorf("create tap %s: %w", name, err) + if cErr := createTAP(name, queues); cErr != nil { + return nil, fmt.Errorf("create tap %s: %w", name, cErr) } + added = append(added, spec.Index) - tap, err := netlink.LinkByName(name) - if err != nil { - return nil, fmt.Errorf("find tap %s: %w", name, err) + tap, lErr := netlink.LinkByName(name) + if lErr != nil { + return nil, fmt.Errorf("find tap %s: %w", name, lErr) } - if err := netlink.LinkSetMaster(tap, br); err != nil { - _ = netlink.LinkDel(tap) - return nil, fmt.Errorf("add %s to %s: %w", name, b.bridgeDev, err) + if mErr := netlink.LinkSetMaster(tap, br); mErr != nil { + return nil, fmt.Errorf("add %s to %s: %w", name, b.bridgeDev, mErr) } - // Disable FDB source MAC learning per-packet overhead. _ = netlink.LinkSetLearning(tap, false) - if mtu := br.Attrs().MTU; mtu > 0 { _ = netlink.LinkSetMTU(tap, mtu) } - _ = network.TuneTAP(tap) - if err := netlink.LinkSetUp(tap); err != nil { - _ = netlink.LinkDel(tap) - return nil, fmt.Errorf("set %s up: %w", name, err) + if uErr := netlink.LinkSetUp(tap); uErr != nil { + return nil, fmt.Errorf("set %s up: %w", name, uErr) } configs = append(configs, &types.NetworkConfig{ @@ -121,14 +124,19 @@ func (b *Bridge) Config(ctx context.Context, vmID string, numNICs int, vmCfg *ty MAC: mac, NumQueues: queues, QueueSize: network.ResolveQueueSize(vmCfg.QueueSize), - Backend: typ, + Backend: types.BackendBridge, BridgeDev: b.bridgeDev, }) - logger.Debugf(ctx, "NIC %d: tap=%s mac=%s bridge=%s", i, name, mac, b.bridgeDev) + logger.Debugf(ctx, "NIC %d: tap=%s mac=%s bridge=%s", spec.Index, name, mac, b.bridgeDev) } return configs, nil } +// Remove deletes the TAP devices for the given indices. +func (b *Bridge) Remove(_ context.Context, vmID string, indices ...int) error { + return tearDownTAPs(vmID, indices, false) +} + // Delete removes TAP devices for the given VMs. func (b *Bridge) Delete(_ context.Context, vmIDs []string) ([]string, error) { return CleanupTAPs(vmIDs), nil @@ -149,25 +157,44 @@ func (b *Bridge) RegisterGC(orch *gc.Orchestrator) { gc.Register(orch, GCModule(b.conf.RootDir)) } -// CleanupTAPs removes bridge TAP devices for the given VM IDs. -// It does not require a Bridge instance and is safe to call -// even when no bridge TAPs exist (no-op per VM). +// CleanupTAPs probes and removes bridge TAP devices for the given VM IDs. +// No-op per VM if none exist; safe without a Bridge instance. func CleanupTAPs(vmIDs []string) []string { - var cleaned []string + cleaned := make([]string, 0, len(vmIDs)) for _, vmID := range vmIDs { + var indices []int for i := 0; ; i++ { - name := tapName(vmID, i) - l, err := netlink.LinkByName(name) - if err != nil { - break // no more TAPs for this VM + if _, err := netlink.LinkByName(tapName(vmID, i)); err != nil { + break } - _ = netlink.LinkDel(l) + indices = append(indices, i) } + _ = tearDownTAPs(vmID, indices, true) cleaned = append(cleaned, vmID) } return cleaned } +func tearDownTAPs(vmID string, indices []int, bestEffort bool) error { + for _, i := range indices { + name := tapName(vmID, i) + link, err := netlink.LinkByName(name) + if err != nil { + if bestEffort { + continue + } + return fmt.Errorf("find tap %s: %w", name, err) + } + if err := netlink.LinkDel(link); err != nil { + if bestEffort { + continue + } + return fmt.Errorf("delete tap %s: %w", name, err) + } + } + return nil +} + func createTAP(name string, numQueues int) error { // queue_pairs = num_queues / 2 (TX+RX pair per queue). // Multi-queue requires queue_pairs > 1. diff --git a/network/bridge/bridge_other.go b/network/bridge/bridge_other.go index fb4ba4ec..031cb2e6 100644 --- a/network/bridge/bridge_other.go +++ b/network/bridge/bridge_other.go @@ -9,6 +9,7 @@ import ( "github.com/cocoonstack/cocoon/config" "github.com/cocoonstack/cocoon/gc" + "github.com/cocoonstack/cocoon/network" "github.com/cocoonstack/cocoon/types" ) @@ -28,11 +29,14 @@ func (b *Bridge) Type() string { return "bridge" } // Verify is not supported. func (b *Bridge) Verify(_ context.Context, _ string) error { return errUnsupported } -// Config is not supported. -func (b *Bridge) Config(_ context.Context, _ string, _ int, _ *types.VMConfig, _ ...*types.NetworkConfig) ([]*types.NetworkConfig, error) { +// Add is not supported. +func (b *Bridge) Add(_ context.Context, _ string, _ *types.VMConfig, _ ...network.AddSpec) ([]*types.NetworkConfig, error) { return nil, errUnsupported } +// Remove is not supported. +func (b *Bridge) Remove(_ context.Context, _ string, _ ...int) error { return errUnsupported } + // Delete is not supported. func (b *Bridge) Delete(_ context.Context, _ []string) ([]string, error) { return nil, errUnsupported } diff --git a/network/cni/cni.go b/network/cni/cni.go index bcf4cd7e..e7e5df9b 100644 --- a/network/cni/cni.go +++ b/network/cni/cni.go @@ -38,10 +38,8 @@ type CNI struct { cniConf *libcni.CNIConfig } -// New creates a CNI network provider. -// CNI conflist loading is best-effort at creation time; if no conflist is -// available (e.g. no network needed), Delete/Inspect/List still work. -// Config() will fail if the conflist is not loaded. +// New creates a CNI provider; conflist loading is best-effort so Delete/Inspect/List +// still work when none are available — Add fails in that case. func New(conf *config.Config) (*CNI, error) { if conf == nil { return nil, fmt.Errorf("config is nil") @@ -113,13 +111,7 @@ func (c *CNI) List(ctx context.Context) ([]*types.Network, error) { }) } -// Delete removes all network resources for the given VM IDs: -// 1. CNI DEL for each NIC (releases IP from IPAM, removes veth pair). -// 2. Remove the named netns (kernel cleans up bridge + tap automatically). -// 3. Remove network records from the DB. -// -// Best-effort: failing to clean one VM does not block others. -// Returns the VM IDs that were fully cleaned. +// Delete tears down all NICs for each VM and removes the netns. Best-effort. func (c *CNI) Delete(ctx context.Context, vmIDs []string) ([]string, error) { result := utils.ForEach(ctx, vmIDs, func(ctx context.Context, vmID string) error { return c.deleteVM(ctx, vmID) @@ -127,9 +119,7 @@ func (c *CNI) Delete(ctx context.Context, vmIDs []string) ([]string, error) { return result.Succeeded, result.Err() } -// deleteVM cleans up all network resources for a single VM. func (c *CNI) deleteVM(ctx context.Context, vmID string) error { - // Collect value-copies of records for this VM. var records []networkRecord if err := c.store.With(ctx, func(idx *networkIndex) error { records = idx.byVMID(vmID) @@ -137,58 +127,72 @@ func (c *CNI) deleteVM(ctx context.Context, vmID string) error { }); err != nil { return fmt.Errorf("read network index: %w", err) } - - // Nothing to clean — VM had no network or was already cleaned. - if len(records) == 0 { - return nil - } - + // Run even when records is empty: a VM resized to 0 NICs still owns its netns. nsPath := netnsPath(vmID) - - // CNI DEL for each NIC — releases IPs from IPAM and removes veth pairs. - // Best-effort: log failures but continue. Netns deletion cleans up - // devices anyway; CNI DEL is primarily for IPAM bookkeeping. - c.delNICs(ctx, vmID, nsPath, records) - - // deleteNetns retries briefly to handle async fd cleanup after process kill. + _ = c.tearDownNICs(ctx, vmID, nsPath, records, false, true) nsName := netnsName(vmID) if err := deleteNetns(ctx, nsName); err != nil && !errors.Is(err, fs.ErrNotExist) { return fmt.Errorf("remove netns %s: %w", nsPath, err) } - - return c.store.Update(ctx, func(idx *networkIndex) error { - for id, rec := range idx.Networks { - if rec != nil && rec.VMID == vmID { - delete(idx.Networks, id) - } - } - return nil - }) + allIDs := make([]string, 0, len(records)) + for _, rec := range records { + allIDs = append(allIDs, rec.ID) + } + return c.deleteRecords(ctx, allIDs) } -// delNICs performs best-effort CNI DEL for each NIC record. -// Failures are logged but never returned — netns deletion cleans up -// devices anyway; CNI DEL is primarily for IPAM bookkeeping. -func (c *CNI) delNICs(ctx context.Context, vmID, nsPath string, records []networkRecord) { +// tearDownNICs runs CNI DEL (+ optional TAP delete) per record; bestEffort=false stops at first failure. Caller sweeps DB. +func (c *CNI) tearDownNICs(ctx context.Context, vmID, nsPath string, records []networkRecord, deleteTAP, bestEffort bool) error { + logger := log.WithFunc("cni.tearDownNICs") if c.cniConf == nil { - return + if !bestEffort { + return fmt.Errorf("%w: no conflist found in %s", network.ErrNotConfigured, c.conf.CNIConfDir) + } + return nil } - logger := log.WithFunc("cni.delNICs") for _, rec := range records { + var recErr error cl, err := c.confListByName(rec.Type) if err != nil { + recErr = err logger.Warnf(ctx, "conflist %q not found for CNI DEL %s/%s: %v", rec.Type, vmID, rec.IfName, err) - continue } - rt := &libcni.RuntimeConf{ - ContainerID: vmID, - NetNS: nsPath, - IfName: rec.IfName, + if cl != nil { + if err := c.cniDel(ctx, cl, vmID, nsPath, rec.IfName); err != nil { + if recErr == nil { + recErr = fmt.Errorf("cni del %s/%s: %w", vmID, rec.IfName, err) + } + logger.Warnf(ctx, "CNI DEL %s/%s: %v", vmID, rec.IfName, err) + } } - if err := c.cniConf.DelNetworkList(ctx, cl, rt); err != nil { - logger.Warnf(ctx, "CNI DEL %s/%s: %v", vmID, rec.IfName, err) + if deleteTAP { + var idx int + if _, scanErr := fmt.Sscanf(rec.IfName, "eth%d", &idx); scanErr != nil { + logger.Warnf(ctx, "parse ifname %q for %s: %v (skip tap delete)", rec.IfName, vmID, scanErr) + } else if delErr := deleteTAPInNetns(nsPath, tapNameForVM(vmID, idx)); delErr != nil { + if recErr == nil { + recErr = fmt.Errorf("delete tap %s: %w", tapNameForVM(vmID, idx), delErr) + } + logger.Warnf(ctx, "delete tap %s in netns %s: %v", tapNameForVM(vmID, idx), nsPath, delErr) + } } + if recErr != nil && !bestEffort { + return recErr + } + } + return nil +} + +func (c *CNI) deleteRecords(ctx context.Context, ids []string) error { + if len(ids) == 0 { + return nil } + return c.store.Update(ctx, func(idx *networkIndex) error { + for _, id := range ids { + delete(idx.Networks, id) + } + return nil + }) } // confListByName resolves a conflist by name. diff --git a/network/cni/create.go b/network/cni/create.go index 34d7cc87..08056c69 100644 --- a/network/cni/create.go +++ b/network/cni/create.go @@ -2,7 +2,10 @@ package cni import ( "context" + "errors" "fmt" + "io/fs" + "os" "github.com/containernetworking/cni/libcni" cnitypes "github.com/containernetworking/cni/pkg/types" @@ -14,91 +17,104 @@ import ( "github.com/cocoonstack/cocoon/utils" ) -// Config creates the netns, runs CNI ADD, and wires each NIC to a tap via TC. -func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types.VMConfig, existing ...*types.NetworkConfig) (configs []*types.NetworkConfig, retErr error) { +// Add creates the netns (if absent) and allocates each NIC's CNI plumbing. +func (c *CNI) Add(ctx context.Context, vmID string, vmCfg *types.VMConfig, specs ...network.AddSpec) (configs []*types.NetworkConfig, retErr error) { if c.cniConf == nil { return nil, fmt.Errorf("%w: no conflist found in %s", network.ErrNotConfigured, c.conf.CNIConfDir) } + if len(specs) == 0 { + return nil, nil + } confList, err := c.confListByName(vmCfg.Network) if err != nil { return nil, err } vmCfg.Network = confList.Name - logger := log.WithFunc("cni.Config") + logger := log.WithFunc("cni.Add") nsName := netnsName(vmID) nsPath := netnsPath(vmID) - if err := createNetns(nsName); err != nil { - return nil, fmt.Errorf("create netns %s: %w", nsName, err) + createdNetns, err := ensureNetns(nsName, nsPath) + if err != nil { + return nil, fmt.Errorf("ensure netns %s: %w", nsName, err) } - var addedIFs []string + addedIdx := make([]int, 0, len(specs)) defer func() { if retErr == nil { return } - for _, ifn := range addedIFs { - rt := &libcni.RuntimeConf{ - ContainerID: vmID, - NetNS: nsPath, - IfName: ifn, - } - if delErr := c.cniConf.DelNetworkList(ctx, confList, rt); delErr != nil { + for _, i := range addedIdx { + ifn := fmt.Sprintf("eth%d", i) + if delErr := c.cniDel(ctx, confList, vmID, nsPath, ifn); delErr != nil { logger.Warnf(ctx, "rollback CNI DEL %s/%s: %v", vmID, ifn, delErr) } + // setupTCRedirect creates the TAP; it would leak if the netns persists. + if !createdNetns { + if delErr := deleteTAPInNetns(nsPath, tapNameForVM(vmID, i)); delErr != nil { + logger.Warnf(ctx, "rollback tap delete %s: %v", tapNameForVM(vmID, i), delErr) + } + } } - _ = deleteNetns(ctx, nsName) - }() - - for i := range numNICs { - ifName := fmt.Sprintf("eth%d", i) - tapName := tapNameForVM(vmID, i) - - rt := &libcni.RuntimeConf{ - ContainerID: vmID, - NetNS: nsPath, - IfName: ifName, + if createdNetns { + _ = deleteNetns(ctx, nsName) } + }() - if i < len(existing) && existing[i] != nil { - if delErr := c.cniConf.DelNetworkList(ctx, confList, rt); delErr != nil { + type freshNIC struct { + index int + cfg *types.NetworkConfig + } + configs = make([]*types.NetworkConfig, 0, len(specs)) + fresh := make([]freshNIC, 0, len(specs)) + for _, spec := range specs { + ifName := fmt.Sprintf("eth%d", spec.Index) + tapName := tapNameForVM(vmID, spec.Index) + + rt := &libcni.RuntimeConf{ContainerID: vmID, NetNS: nsPath, IfName: ifName} + if spec.Existing != nil { + if delErr := c.cniDel(ctx, confList, vmID, nsPath, ifName); delErr != nil { logger.Warnf(ctx, "pre-recovery CNI DEL %s/%s: %v (continuing)", vmID, ifName, delErr) } - if existing[i].Network != nil && existing[i].Network.IP != "" { - rt.Args = [][2]string{{"IgnoreUnknown", "1"}, {"IP", existing[i].Network.IP}} + if spec.Existing.Network != nil && spec.Existing.Network.IP != "" { + rt.Args = [][2]string{{"IgnoreUnknown", "1"}, {"IP", spec.Existing.Network.IP}} } } - cniResult, err := c.cniConf.AddNetworkList(ctx, confList, rt) - if err != nil { - return nil, fmt.Errorf("cni add %s/%s: %w", vmID, ifName, err) + cniResult, addErr := c.cniConf.AddNetworkList(ctx, confList, rt) + if addErr != nil { + return nil, fmt.Errorf("cni add %s/%s: %w", vmID, ifName, addErr) } - addedIFs = append(addedIFs, ifName) + addedIdx = append(addedIdx, spec.Index) - netInfo, err := extractNetworkInfo(cniResult) - if err != nil { - return nil, fmt.Errorf("parse CNI result: %w", err) + netInfo, parseErr := extractNetworkInfo(cniResult) + if parseErr != nil { + return nil, fmt.Errorf("parse CNI result: %w", parseErr) } var overrideMAC string - if i < len(existing) && existing[i] != nil { - overrideMAC = existing[i].MAC + if spec.Existing != nil { + overrideMAC = spec.Existing.MAC } mac, setupErr := setupTCRedirect(nsPath, ifName, tapName, network.NetNumQueues(vmCfg.CPU), overrideMAC) if setupErr != nil { return nil, fmt.Errorf("setup tc-redirect %s: %w", vmID, setupErr) } - configs = append(configs, &types.NetworkConfig{ + cfg := &types.NetworkConfig{ TAP: tapName, MAC: mac, NumQueues: network.NetNumQueues(vmCfg.CPU), QueueSize: network.ResolveQueueSize(vmCfg.QueueSize), - Backend: typ, + Backend: types.BackendCNI, NetnsPath: nsPath, Network: netInfo, - }) + } + configs = append(configs, cfg) + if spec.Existing == nil { + fresh = append(fresh, freshNIC{index: spec.Index, cfg: cfg}) + } var logIP, logGW string if netInfo != nil { @@ -106,36 +122,82 @@ func (c *CNI) Config(ctx context.Context, vmID string, numNICs int, vmCfg *types logGW = netInfo.Gateway } logger.Debugf(ctx, "NIC %d: %s ip=%s gw=%s tap=%s mac=%s", - i, ifName, logIP, logGW, tapName, mac) - } - - if len(existing) > 0 { - return configs, nil + spec.Index, ifName, logIP, logGW, tapName, mac) } return configs, c.store.Update(ctx, func(idx *networkIndex) error { - for i, cfg := range configs { + for _, f := range fresh { netID := utils.GenerateID() var net types.Network - if cfg.Network != nil { - net = *cfg.Network + if f.cfg.Network != nil { + net = *f.cfg.Network } idx.Networks[netID] = &networkRecord{ ID: netID, Type: confList.Name, Network: net, VMID: vmID, - IfName: fmt.Sprintf("eth%d", i), + IfName: fmt.Sprintf("eth%d", f.index), } } return nil }) } +// Remove tears down NIC plumbing for the given indices; preserves the netns. +// Always sweeps DB records for picked indices so resize-up to the same index isn't stale; CNI/TAP errors still propagate. +func (c *CNI) Remove(ctx context.Context, vmID string, indices ...int) error { + if len(indices) == 0 { + return nil + } + var records []networkRecord + if err := c.store.With(ctx, func(idx *networkIndex) error { + records = idx.byVMID(vmID) + return nil + }); err != nil { + return fmt.Errorf("read network index: %w", err) + } + byIfName := make(map[string]networkRecord, len(records)) + for _, r := range records { + byIfName[r.IfName] = r + } + picked := make([]networkRecord, 0, len(indices)) + pickedIDs := make([]string, 0, len(indices)) + for _, i := range indices { + ifName := fmt.Sprintf("eth%d", i) + rec, ok := byIfName[ifName] + if !ok { + return fmt.Errorf("nic %d (%s): no record", i, ifName) + } + picked = append(picked, rec) + pickedIDs = append(pickedIDs, rec.ID) + } + err := c.tearDownNICs(ctx, vmID, netnsPath(vmID), picked, true, false) + return errors.Join(err, c.deleteRecords(ctx, pickedIDs)) +} + +func (c *CNI) cniDel(ctx context.Context, confList *libcni.NetworkConfigList, vmID, nsPath, ifName string) error { + rt := &libcni.RuntimeConf{ContainerID: vmID, NetNS: nsPath, IfName: ifName} + return c.cniConf.DelNetworkList(ctx, confList, rt) +} + func tapNameForVM(vmID string, nic int) string { return fmt.Sprintf("tap%s-%d", network.VMIDPrefix(vmID), nic) } +// ensureNetns creates the netns if missing; bool reports whether this call did the creation. +func ensureNetns(name, nsPath string) (bool, error) { + if _, err := os.Stat(nsPath); err == nil { + return false, nil + } else if !errors.Is(err, fs.ErrNotExist) { + return false, err + } + if err := createNetns(name); err != nil { + return false, err + } + return true, nil +} + // extractNetworkInfo converts a CNI ADD result into types.Network. func extractNetworkInfo(result cnitypes.Result) (*types.Network, error) { newResult, err := current.NewResultFromResult(result) diff --git a/network/cni/create_darwin.go b/network/cni/create_darwin.go index 9fd783ea..380e7411 100644 --- a/network/cni/create_darwin.go +++ b/network/cni/create_darwin.go @@ -18,3 +18,7 @@ func deleteNetns(_ context.Context, _ string) error { func setupTCRedirect(_, _, _ string, _ int, _ string) (string, error) { return "", errNotSupported } + +func deleteTAPInNetns(_, _ string) error { + return errNotSupported +} diff --git a/network/cni/create_linux.go b/network/cni/create_linux.go index 2ad228f8..39800604 100644 --- a/network/cni/create_linux.go +++ b/network/cni/create_linux.go @@ -52,6 +52,17 @@ func deleteNetns(ctx context.Context, name string) error { }) } +// deleteTAPInNetns deletes a named TAP device inside target netns. +func deleteTAPInNetns(nsPath, tapName string) error { + return cns.WithNetNSPath(nsPath, func(_ cns.NetNS) error { + link, err := netlink.LinkByName(tapName) + if err != nil { + return fmt.Errorf("find %s: %w", tapName, err) + } + return netlink.LinkDel(link) + }) +} + // setupTCRedirect wires ifName <-> tapName inside target netns, returns MAC. func setupTCRedirect(nsPath, ifName, tapName string, queues int, overrideMAC string) (string, error) { var mac string diff --git a/network/cni/gc.go b/network/cni/gc.go index 0bcd0c47..d96937d6 100644 --- a/network/cni/gc.go +++ b/network/cni/gc.go @@ -76,7 +76,7 @@ func (c *CNI) GCModule() gc.Module[cniSnapshot] { } // 2. CNI DEL per NIC — best-effort IPAM release. - c.delNICs(ctx, vmID, netnsPath(vmID), records) + _ = c.tearDownNICs(ctx, vmID, netnsPath(vmID), records, false, true) // 3. Remove the named netns (with retry for async kernel fd cleanup). nsName := netnsName(vmID) diff --git a/network/network.go b/network/network.go index e9c5d726..ef4b9d24 100644 --- a/network/network.go +++ b/network/network.go @@ -13,21 +13,38 @@ var ( ErrNotConfigured = errors.New("network provider not configured") ) -// Network defines the interface for a network provider (CNI, etc.). +// AddSpec is one NIC's add request; Existing != nil reuses MAC/IP for recovery. +type AddSpec struct { + Index int + Existing *types.NetworkConfig +} + +// Network is the per-VM host-side networking provider (CNI, bridge, ...). type Network interface { Type() string - - // Verify checks whether the network namespace for a VM exists. - // Returns nil if the netns is present, an error otherwise. Verify(ctx context.Context, vmID string) error - // Config creates network namespace, bridge, and tap for a VM. - // When existing configs are provided (recovery after host reboot), - // the netns and tap devices are recreated using the persisted MAC addresses. - // NOTE: vmCfg.Network may be mutated to record the resolved conflist name. - Config(ctx context.Context, vmID string, numNICs int, vmCfg *types.VMConfig, existing ...*types.NetworkConfig) ([]*types.NetworkConfig, error) + Add(ctx context.Context, vmID string, vmCfg *types.VMConfig, specs ...AddSpec) ([]*types.NetworkConfig, error) + Remove(ctx context.Context, vmID string, indices ...int) error Delete(context.Context, []string) ([]string, error) Inspect(context.Context, string) (*types.Network, error) List(context.Context) ([]*types.Network, error) - RegisterGC(*gc.Orchestrator) } + +// AddRange builds AddSpecs for a contiguous block of fresh NIC indices. +func AddRange(from, count int) []AddSpec { + out := make([]AddSpec, count) + for i := range out { + out[i] = AddSpec{Index: from + i} + } + return out +} + +// AddRecover builds AddSpecs for re-creating existing NICs (post-reboot recovery). +func AddRecover(existing []*types.NetworkConfig) []AddSpec { + out := make([]AddSpec, len(existing)) + for i, e := range existing { + out[i] = AddSpec{Index: i, Existing: e} + } + return out +} diff --git a/types/network.go b/types/network.go index 399d9d77..d7b6ffc1 100644 --- a/types/network.go +++ b/types/network.go @@ -1,5 +1,11 @@ package types +// Network backend identifiers stored in NetworkConfig.Backend. +const ( + BackendCNI = "cni" + BackendBridge = "bridge" +) + // NetworkConfig describes a single NIC attached to a VM. type NetworkConfig struct { TAP string `json:"tap"`