eleostech · pnc · May 15, 2026 · May 15, 2026 · May 21, 2026 · May 21, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -4,8 +4,9 @@
 
 ## Development workflow
 
-Before running the e2e tests, ensure the test prerequisites from
-HACKING.md are installed.
+The e2e tests run inside this VM. If `uv` is not yet installed, install
+it first (see Tooling policy below). Ensure the test prerequisites from
+HACKING.md are also installed.
 
 Always run the test suite before committing:
 
@@ -15,6 +16,24 @@ uv run pytest tests/test_e2e.py -v -s
 
 The test boots the VM end-to-end (takes ~90s without KVM) and verifies `curl https://pypi.org` works through mitmproxy. Do not commit if this fails.
 
+### Fixing CI failures
+
+When a test fails in CI but passes locally, **reproduce the failure locally
+before applying a fix.** This VM has KVM, but CI may not — one known
+divergence is the QEMU CPU model (`-cpu host` with KVM vs `-cpu max` with
+TCG). To match CI's TCG environment:
+
+```bash
+QEMU_ACCEL=tcg uv run pytest tests/test_e2e.py::test_that_failed -v -s
+```
+
+The workflow is:
+
+1. **Reproduce** — run the failing test under CI-like conditions and confirm it fails.
+2. **Fix** — apply the change.
+3. **Verify** — re-run under the same conditions and confirm it passes.
+4. **Full suite** — run the complete test suite to check for regressions.
+
 The full suite including the network isolation tests can take 5+ minutes under TCG emulation. TCG is slower than KVM but not *that* slow — if cloud-init status is unchanged for more than a minute, check the console log and process list rather than assuming it's just slow. A dead QEMU process or OOM kill is more likely than TCG being the bottleneck.
 
 Launch the test with `Bash` using `run_in_background: true`, then immediately attach a `Monitor` to tail the output file with a progress filter. This keeps the conversation unblocked while streaming results:

diff --git a/allowlist.txt b/allowlist.txt
@@ -59,20 +59,27 @@ GET https://api.anthropic.com/api/hello
 # astral.sh; binary downloads come from releases.astral.sh (or GitHub
 # release assets as a fallback).  URLs vary by version and platform.
 GET https://astral.sh/uv/install.sh
+GET https://releases.astral.sh/installers/uv/*
 GET https://releases.astral.sh/github/uv/releases/*
 GET https://github.com/astral-sh/uv/releases/*
 GET https://release-assets.githubusercontent.com/github-production-release-asset/*
 
-# ── Docker Hub ────────────────────────────────────────────────────
-# Registry API — paths vary by image name, tag, and sha256 digest
-# (e.g. /v2/library/hello-world/manifests/latest).  Scoped to /v2/.
-GET  https://registry-1.docker.io/v2/*
-# Auth tokens — the registry returns 401 with a token URL whose
-# query parameters vary per request (scope, service, etc.).
+# ── Docker Hub (hello-world only) ─────────────────────────────────
+# Scoped to the library/hello-world image used by the e2e test.
+# To pull other images, add their specific paths here.
+# /v2/ (bare) is Docker's registry version check — required before
+# any image pull.
+GET  https://registry-1.docker.io/v2/
+GET  https://registry-1.docker.io/v2/library/hello-world/*
+# Auth tokens — not scoped: the wildcard matches any scope/service query.
 GET https://auth.docker.io/token*
-# Blob storage — the registry redirects layer downloads to this
-# Cloudflare R2 bucket.  Paths contain per-blob sha256 digests.
+# Blob storage — the registry redirects layer downloads to either
+# a Cloudflare R2 bucket or CloudFront CDN.  Blob paths contain
+# sha256 digests that can't be scoped per-image, but the registry
+# only returns redirect URLs for layers belonging to images the
+# client already resolved via the scoped manifest rules above.
 GET https://docker-images-prod.6aa30f8b08e16409b46e0173d6de2f56.r2.cloudflarestorage.com/registry-v2/*
+GET https://production.cloudfront.docker.com/registry-v2/*
 
 # ── Debian cloud images (nested VM testing only) ──────────────────
 # Only needed when running the e2e test suite inside a VM (i.e. the

diff --git a/cloud-init/user-data b/cloud-init/user-data
@@ -21,12 +21,18 @@ bootcmd:
         wget -qO /usr/local/share/ca-certificates/mitmproxy.crt http://mitm.it/cert/pem
       update-ca-certificates
     fi
-  # Disable initramfs rebuilds — this is an ephemeral VM that is never
-  # rebooted.  The generic kernel's mkinitramfs takes 2+ minutes under
-  # TCG because it copies hundreds of driver modules.
-  - dpkg-divert --local --rename --add /usr/sbin/update-initramfs
-  - ln -sf /bin/true /usr/sbin/update-initramfs
-
+  # Suppress initramfs rebuilds during first-boot provisioning only.
+  # Package installs (docker.io, etc.) trigger the initramfs-tools dpkg
+  # hook, and a full rebuild takes minutes under TCG emulation.  The
+  # diversion is undone in runcmd so that future kernel upgrades (via
+  # unattended-upgrades) generate a working initramfs.
+  # Guard: boot-finished is written at the very end of cloud-init's final
+  # stage, so it is absent during first boot and present on later boots.
+  - |
+    if [ ! -f /var/lib/cloud/instance/boot-finished ]; then
+      dpkg-divert --local --rename --add /usr/sbin/update-initramfs 2>/dev/null
+      ln -sf /bin/true /usr/sbin/update-initramfs
+    fi
 write_files:
   - path: /etc/apt/apt.conf.d/90proxy
     content: |
@@ -64,6 +70,13 @@ write_files:
       Environment="HTTPS_PROXY=http://__HOST_IP__:__PROXY_PORT__"
       Environment="NO_PROXY=localhost,127.0.0.1,__HOST_IP__"
 
+  # The default MODULES=most copies hundreds of bare-metal drivers into
+  # the initramfs.  Under TCG emulation this takes minutes.  MODULES=dep
+  # limits it to modules for detected hardware (virtio), cutting the
+  # rebuild from ~5 min to seconds.
+  - path: /etc/initramfs-tools/conf.d/vm-modules-dep
+    content: |
+      MODULES=dep
   - path: /etc/systemd/system/mnt-9p.mount
     content: |
       [Unit]
@@ -184,6 +197,14 @@ packages:
   - docker.io
 
 runcmd:
+  # Undo the update-initramfs diversion applied during first-boot
+  # provisioning (see bootcmd).  From this point on, kernel upgrades
+  # will generate a proper initramfs.
+  - |
+    if dpkg-divert --list /usr/sbin/update-initramfs 2>/dev/null | grep -q diversion; then
+      rm -f /usr/sbin/update-initramfs
+      dpkg-divert --local --rename --remove /usr/sbin/update-initramfs
+    fi
   - mkdir -p /mnt/9p /home/vm/shared
   - systemctl daemon-reload
   - systemctl enable --now mnt-9p.mount
@@ -200,8 +221,27 @@ runcmd:
   # from /etc/profile.d/proxy.sh).  --no-modify-path because proxy.sh
   # already adds ~/.local/bin to PATH.  Binary lands in /home/vm/.local/bin/.
   - su - vm -c 'curl -LsSf https://astral.sh/uv/install.sh | sh -s -- --no-modify-path'
-  # Install Claude Code CLI.
-  - su - vm -c 'curl -fsSL https://claude.ai/install.sh | bash'
+  # Install Claude Code CLI.  The official install script runs
+  # `claude install` after downloading, which maps ~70 GB of virtual
+  # memory.  Under TCG emulation this either triggers an invalid-opcode
+  # trap (qemu64 lacks the required instructions) or takes so long that
+  # cloud-init times out.  Download the binary directly instead.
+  - |
+    su - vm -c '
+      set -e
+      DOWNLOAD_BASE="https://downloads.claude.ai/claude-code-releases"
+      case "$(uname -m)" in
+        x86_64|amd64) platform="linux-x64" ;;
+        aarch64|arm64) platform="linux-arm64" ;;
+        *) echo "Unsupported arch: $(uname -m)" >&2; exit 1 ;;
+      esac
+      version=$(curl -fsSL "$DOWNLOAD_BASE/latest")
+      mkdir -p ~/.local/share/claude/versions ~/.local/bin
+      curl -fsSL -o ~/.local/share/claude/versions/"$version" \
+        "$DOWNLOAD_BASE/$version/$platform/claude"
+      chmod +x ~/.local/share/claude/versions/"$version"
+      ln -sf ~/.local/share/claude/versions/"$version" ~/.local/bin/claude
+    '
   # Propagate the host user's git identity into the VM so commits
   # made inside the guest have the correct author.  The placeholders
   # are substituted by vm.py from `git config --global`; if the host

diff --git a/tests/test_e2e.py b/tests/test_e2e.py
@@ -181,7 +181,7 @@ def running_vm():
     # vm.py start runs mitmproxy in the background and QEMU in the foreground.
     # Both inherit our file handles, so their output lands in console.log.
     vm_proc = subprocess.Popen(
-        [sys.executable, str(VM_PY), "start", "--memory", "512M",
+        [sys.executable, str(VM_PY), "start", "--memory", "2G",
          "--ssh-port", str(TEST_SSH_PORT),
          "--proxy-port", str(TEST_PROXY_PORT),
          "--extra-user-data", str(REPO / "tests" / "nmap.yaml")],
@@ -332,6 +332,45 @@ def test_docker_hello_world(running_vm):
     )
 
 
+def test_uv_installed(running_vm):
+    """Provisioning must leave a working `uv` on the guest's login-shell PATH."""
+    _progress("Checking uv installation…")
+    probe = _vm_ssh("bash -lc 'uv --version'", timeout=30)
+    assert probe.returncode == 0, (
+        f"uv not installed or not on PATH (rc={probe.returncode}):\n"
+        f"stdout: {probe.stdout[:500]}\nstderr: {probe.stderr[:500]}"
+    )
+    assert "uv" in probe.stdout, f"Unexpected uv --version output: {probe.stdout}"
+
+
+def test_claude_code_installed(running_vm):
+    """`claude --version` must succeed in a login shell after cloud-init provisioning."""
+    _progress("Checking Claude Code installation…")
+    r = _vm_ssh("bash -lc 'claude --version'", timeout=30)
+    if r.returncode != 0:
+        diag = _vm_ssh(
+            "bash -lc '"
+            "echo \"=== binary ===\"; ls -la ~/.local/bin/claude 2>&1; "
+            "echo \"=== versions ===\"; ls ~/.local/share/claude/versions/ 2>&1; "
+            "echo \"=== file ===\"; file $(readlink -f ~/.local/bin/claude) 2>&1; "
+            "echo \"=== ldd ===\"; ldd $(readlink -f ~/.local/bin/claude) 2>&1; "
+            "echo \"=== dmesg ===\"; sudo dmesg 2>&1 | tail -20; "
+            "echo \"=== free ===\"; free -h 2>&1; "
+            "echo \"=== PATH ===\"; echo PATH=$PATH'",
+            timeout=10,
+        )
+        # pytest.fail, not `assert False`: matches the reboot test and is not stripped by -O.
+        pytest.fail(
+            f"claude not installed or not on PATH (rc={r.returncode}):\n"
+            f"stdout: {r.stdout[:500]}\nstderr: {r.stderr[:500]}\n"
+            f"diagnostics:\n{diag.stdout[:2000]}"
+        )
+    output = (r.stdout + r.stderr).lower()
+    assert "claude" in output, (
+        f"Unexpected claude --version output:\nstdout: {r.stdout!r}\nstderr: {r.stderr!r}"
+    )
+
+
 def test_blocked_domain(running_vm):
     """Requests to domains not in filter.py's allowlist should be blocked with 403."""
     result = _vm_ssh(
@@ -676,3 +715,125 @@ def test_guest_cannot_modify_host_allowlist(running_vm):
         _vm_ssh(f"rm -f ~/shared/{marker} 2>/dev/null; true", timeout=10)
         # Safety net: restore original content in case the test failed
         allowlist_path.write_text(original_content)
+
+
+# ---------------------------------------------------------------------------
+# Kernel upgrade + reboot
+# ---------------------------------------------------------------------------
+
+
+def test_kernel_install_and_reboot(running_vm):
+    """Installing a new kernel and rebooting must not kernel panic.
+
+    The base cloud-init config once diverted update-initramfs to /bin/true
+    to speed up provisioning (~2 min saved under TCG emulation).  This was
+    safe under the assumption that the VM was ephemeral and never rebooted.
+    In practice, Debian's unattended-upgrades installs kernel security
+    updates on a daily timer.  Because update-initramfs was a no-op, the
+    new kernel shipped without an initramfs.  GRUB's os-prober still picked
+    up the new vmlinuz and made it the default boot entry — but with no
+    initrd line.  On next boot the kernel couldn't load the virtio_blk
+    module (it lives in the initramfs, not built-in), so the root disk was
+    invisible and the kernel panicked:
+
+        VFS: Cannot open root device "PARTUUID=..." or unknown-block(0,0)
+        Kernel panic - not syncing: VFS: Unable to mount root fs
+
+    This test reproduces that scenario end-to-end: install a second kernel
+    flavor, set GRUB to boot it, and reboot.  If update-initramfs is broken,
+    the VM kernel-panics and SSH never comes back.
+
+    Placed last because it reboots the VM.
+    """
+    # Detect guest architecture to pick the right cloud kernel package.
+    r = _vm_ssh("dpkg --print-architecture", timeout=10)
+    assert r.returncode == 0, f"Cannot determine guest architecture:\n{r.stderr}"
+    arch = r.stdout.strip()
+    cloud_pkg = f"linux-image-cloud-{arch}"
+
+    _progress(f"Installing {cloud_pkg}…")
+    r = _vm_ssh(
+        f"bash -lc 'sudo apt-get install -y -qq {cloud_pkg} 2>&1'",
+        timeout=600,
+    )
+    assert r.returncode == 0, (
+        f"Kernel install failed (rc={r.returncode}):\n"
+        f"{r.stdout[-2000:]}\n{r.stderr[-2000:]}"
+    )
+
+    # Find the newly installed cloud kernel version.
+    r = _vm_ssh(f"ls /boot/vmlinuz-*-cloud-{arch}", timeout=10)
+    assert r.returncode == 0, f"No cloud kernel found in /boot:\n{r.stderr}"
+    cloud_vmlinuz = r.stdout.strip().splitlines()[-1].strip()
+    cloud_version = cloud_vmlinuz.rsplit("/", 1)[-1].removeprefix("vmlinuz-")
+    _progress(f"Installed cloud kernel: {cloud_version}")
+
+    # Verify the initramfs was created for it.
+    r = _vm_ssh(f"test -f /boot/initrd.img-{cloud_version}", timeout=10)
+    assert r.returncode == 0, (
+        f"initrd.img-{cloud_version} was not created.\n"
+        "update-initramfs is likely diverted to /bin/true."
+    )
+
+    # Set GRUB to boot the cloud kernel by default.
+    # Read the root filesystem UUID from the running VM rather than
+    # hardcoding a PARTUUID that is specific to one image build.
+    r = _vm_ssh("sudo grub-probe --target=fs_uuid /", timeout=10)
+    assert r.returncode == 0, f"Cannot determine root FS UUID:\n{r.stderr}"
+    root_uuid = r.stdout.strip()
+    grub_entry = f"gnulinux-advanced-{root_uuid}>gnulinux-{cloud_version}-advanced-{root_uuid}"
+    # Check both GRUB steps so a failure is reported here, not as a confusing reboot result.
+    r = _vm_ssh(f"sudo grub-set-default '{grub_entry}' 2>&1", timeout=10)
+    assert r.returncode == 0, f"grub-set-default failed (rc={r.returncode}):\n{r.stdout}"
+    r = _vm_ssh("sudo update-grub 2>&1", timeout=60)
+    assert r.returncode == 0, f"update-grub failed (rc={r.returncode}):\n{r.stdout[-2000:]}"
+
+    # Verify GRUB config has an initrd line for the cloud kernel.
+    r = _vm_ssh("cat /boot/grub/grub.cfg", timeout=10)
+    expected_initrd = f"initrd\t/boot/initrd.img-{cloud_version}"
+    assert expected_initrd in r.stdout, f"GRUB config missing initrd for {cloud_version}."
+
+    # Record this boot's ID.  A fixed sleep can't tell a slow shutdown from a
+    # finished reboot; a changed boot_id proves we are talking to the new boot.
+    r = _vm_ssh("cat /proc/sys/kernel/random/boot_id", timeout=10)
+    assert r.returncode == 0, f"Cannot read boot_id:\n{r.stderr}"
+    old_boot_id = r.stdout.strip()
+
+    _progress("Rebooting into cloud kernel…")
+    try:
+        _vm_ssh("sudo reboot", timeout=10)
+    except subprocess.TimeoutExpired:
+        pass  # the SSH session can hang as sshd goes down; the poll below decides
+
+    # Wait for SSH to come back on a new boot — if the kernel panicked, it never will.
+    deadline = time.monotonic() + BOOT_TIMEOUT
+    attempt = 0
+    while time.monotonic() < deadline:
+        if running_vm.poll() is not None:
+            _dump_logs()
+            pytest.fail(
+                "QEMU exited during reboot — likely kernel panic.\n"
+                "Check console log above."
+            )
+        attempt += 1
+        remaining = int(deadline - time.monotonic())
+        _progress(f"Post-reboot SSH probe #{attempt} ({remaining}s remaining)…")
+        try:
+            r = _vm_ssh("cat /proc/sys/kernel/random/boot_id", timeout=10)
+            if r.returncode == 0 and r.stdout.strip() not in ("", old_boot_id):
+                _progress(f"VM back after reboot ({attempt} probe(s))")
+                break
+        except subprocess.TimeoutExpired:
+            pass
+        time.sleep(SSH_POLL_INTERVAL)
+    else:
+        _dump_logs()
+        pytest.fail(
+            f"VM did not come back after reboot within {BOOT_TIMEOUT}s.\n"
+            "Likely kernel panic due to missing initramfs."
+        )
+
+    # Confirm we're running the new kernel.
+    r = _vm_ssh("uname -r", timeout=10)
+    _progress(f"Running kernel after reboot: {r.stdout.strip()}")
+    assert "cloud" in r.stdout, f"Expected to boot cloud kernel, got: {r.stdout.strip()}"
diff --git a/vm.py b/vm.py
@@ -166,7 +166,7 @@ def machine_args(self) -> list[str]:
         if self.arch == Arch.ARM64:
             cpu = "host" if self._accel == "hvf" else "cortex-a57"
             return ["-machine", f"virt,accel={self._accel}", "-cpu", cpu]
-        cpu = "host" if self._accel == "hvf" else "qemu64"
+        cpu = "host" if self._accel == "hvf" else "max"
         return ["-machine", f"q35,accel={self._accel}", "-cpu", cpu]
 
     def prepare_efi(self, state_dir: Path) -> tuple[Path, Path]:
@@ -200,7 +200,7 @@ def machine_args(self) -> list[str]:
         if self.arch == Arch.ARM64:
             cpu = "host" if self._accel == "kvm" else "cortex-a57"
             return ["-machine", f"virt,accel={self._accel}", "-cpu", cpu]
-        cpu = "host" if self._accel == "kvm" else "qemu64"
+        cpu = "host" if self._accel == "kvm" else "max"
         return ["-machine", f"q35,accel={self._accel}", "-cpu", cpu]
 
     def prepare_efi(self, state_dir: Path) -> tuple[Path, Path]: