From 7d8e1f7599f0ad2d345b7d4b67bcc2cfa2cc16cd Mon Sep 17 00:00:00 2001 From: Phil Calvin Date: Fri, 15 May 2026 02:58:31 +0000 Subject: [PATCH 01/11] Fix kernel panic after unattended kernel upgrade MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The update-initramfs diversion (to /bin/true) was permanent — applied in bootcmd with no restore. When unattended-upgrades installed a new kernel, its postinst called update-initramfs which silently did nothing. GRUB picked up the new vmlinuz but with no initrd line. On next boot the kernel couldn't load virtio_blk (module in initramfs), so the root disk was invisible and it panicked with "VFS: Unable to mount root fs". Fix: divert only during first-boot provisioning (guarded by boot-finished), then restore in runcmd so future kernel upgrades generate a working initramfs. Add e2e test that installs a second kernel flavor, verifies the initrd is created, reboots, and confirms the VM comes back on the new kernel. Co-Authored-By: Claude Opus 4.6 --- cloud-init/user-data | 26 ++++++++--- tests/test_e2e.py | 109 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+), 6 deletions(-) diff --git a/cloud-init/user-data b/cloud-init/user-data index 044d6dd..c159118 100644 --- a/cloud-init/user-data +++ b/cloud-init/user-data @@ -21,12 +21,18 @@ bootcmd: wget -qO /usr/local/share/ca-certificates/mitmproxy.crt http://mitm.it/cert/pem update-ca-certificates fi - # Disable initramfs rebuilds — this is an ephemeral VM that is never - # rebooted. The generic kernel's mkinitramfs takes 2+ minutes under - # TCG because it copies hundreds of driver modules. - - dpkg-divert --local --rename --add /usr/sbin/update-initramfs - - ln -sf /bin/true /usr/sbin/update-initramfs - + # Suppress initramfs rebuilds during first-boot provisioning only. + # Package installs (docker.io, etc.) trigger the initramfs-tools dpkg + # hook, and a full rebuild takes minutes under TCG emulation. 
The + # diversion is undone in runcmd so that future kernel upgrades (via + # unattended-upgrades) generate a working initramfs. + # Guard: boot-finished is written at the very end of cloud-init's final + # stage, so it won't exist during first boot but will on all subsequent boots. + - | + if [ ! -f /var/lib/cloud/instance/boot-finished ]; then + dpkg-divert --local --rename --add /usr/sbin/update-initramfs 2>/dev/null + ln -sf /bin/true /usr/sbin/update-initramfs + fi write_files: - path: /etc/apt/apt.conf.d/90proxy content: | @@ -184,6 +190,14 @@ packages: - docker.io runcmd: + # Undo the update-initramfs diversion applied during first-boot + # provisioning (see bootcmd). From this point on, kernel upgrades + # will generate a proper initramfs. + - | + if dpkg-divert --list /usr/sbin/update-initramfs 2>/dev/null | grep -q diversion; then + rm -f /usr/sbin/update-initramfs + dpkg-divert --local --rename --remove /usr/sbin/update-initramfs + fi - mkdir -p /mnt/9p /home/vm/shared - systemctl daemon-reload - systemctl enable --now mnt-9p.mount diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 3216c92..20c9929 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -676,3 +676,112 @@ def test_guest_cannot_modify_host_allowlist(running_vm): _vm_ssh(f"rm -f ~/shared/{marker} 2>/dev/null; true", timeout=10) # Safety net: restore original content in case the test failed allowlist_path.write_text(original_content) + + +# --------------------------------------------------------------------------- +# Kernel upgrade + reboot +# --------------------------------------------------------------------------- + + +def test_kernel_install_and_reboot(running_vm): + """Installing a new kernel and rebooting must not kernel panic. + + The base cloud-init config once diverted update-initramfs to /bin/true + to speed up provisioning (~2 min saved under TCG emulation). This was + safe under the assumption that the VM was ephemeral and never rebooted. 
+ In practice, Debian's unattended-upgrades installs kernel security + updates on a daily timer. Because update-initramfs was a no-op, the + new kernel shipped without an initramfs. update-grub still picked + up the new vmlinuz and made it the default boot entry — but with no + initrd line. On next boot the kernel couldn't load the virtio_blk + module (it lives in the initramfs, not built-in), so the root disk was + invisible and the kernel panicked: + + VFS: Cannot open root device "PARTUUID=..." or unknown-block(0,0) + Kernel panic - not syncing: VFS: Unable to mount root fs + + This test reproduces that scenario end-to-end: install a second kernel + flavor, set GRUB to boot it, and reboot. If update-initramfs is broken, + the VM kernel-panics and SSH never comes back. + + Placed last because it reboots the VM. + """ + _progress("Installing cloud kernel flavor…") + r = _vm_ssh( + "bash -lc 'sudo apt-get install -y -qq linux-image-cloud-arm64 2>&1'", + timeout=300, + ) + assert r.returncode == 0, ( + f"Kernel install failed (rc={r.returncode}):\n" + f"{r.stdout[-2000:]}\n{r.stderr[-2000:]}" + ) + + # Find the newly installed cloud kernel version. + r = _vm_ssh("ls /boot/vmlinuz-*-cloud-arm64", timeout=10) + assert r.returncode == 0, f"No cloud kernel found in /boot:\n{r.stderr}" + cloud_vmlinuz = r.stdout.strip().splitlines()[-1].strip() + cloud_version = cloud_vmlinuz.rsplit("/", 1)[-1].removeprefix("vmlinuz-") + _progress(f"Installed cloud kernel: {cloud_version}") + + # Verify the initramfs was created for it. + r = _vm_ssh(f"test -f /boot/initrd.img-{cloud_version}", timeout=10) + assert r.returncode == 0, ( + f"initrd.img-{cloud_version} was not created.\n" + "update-initramfs is likely diverted to /bin/true." + ) + + # Set GRUB to boot the cloud kernel by default. 
+ grub_entry = f"gnulinux-advanced-e82711d0-3a02-4e17-9f90-2f275b0368c5>gnulinux-{cloud_version}-advanced-e82711d0-3a02-4e17-9f90-2f275b0368c5" + _vm_ssh( + f"sudo grub-set-default '{grub_entry}' 2>&1", + timeout=10, + ) + # Alternatively, just make sure it's the default (newest) entry. + _vm_ssh("sudo update-grub 2>&1", timeout=60) + + # Verify GRUB config has an initrd line for the cloud kernel. + r = _vm_ssh("cat /boot/grub/grub.cfg", timeout=10) + assert f"initrd\t/boot/initrd.img-{cloud_version}" in r.stdout, ( + f"GRUB config missing initrd for {cloud_version}." + ) + + _progress("Rebooting into cloud kernel…") + _vm_ssh("sudo reboot", timeout=10) + + # Wait for SSH to go down. + time.sleep(10) + + # Wait for SSH to come back — if the kernel panicked, it never will. + deadline = time.monotonic() + BOOT_TIMEOUT + attempt = 0 + while time.monotonic() < deadline: + if running_vm.poll() is not None: + _dump_logs() + pytest.fail( + "QEMU exited during reboot — likely kernel panic.\n" + "Check console log above." + ) + attempt += 1 + remaining = int(deadline - time.monotonic()) + _progress(f"Post-reboot SSH probe #{attempt} ({remaining}s remaining)…") + try: + r = _vm_ssh("true", timeout=10) + if r.returncode == 0: + _progress(f"VM back after reboot ({attempt} probe(s))") + break + except subprocess.TimeoutExpired: + pass + time.sleep(SSH_POLL_INTERVAL) + else: + _dump_logs() + pytest.fail( + f"VM did not come back after reboot within {BOOT_TIMEOUT}s.\n" + "Likely kernel panic due to missing initramfs." + ) + + # Confirm we're running the new kernel. 
+ r = _vm_ssh("uname -r", timeout=10) + _progress(f"Running kernel after reboot: {r.stdout.strip()}") + assert "cloud" in r.stdout, ( + f"Expected to boot cloud kernel, got: {r.stdout.strip()}" + ) From 9c58db197a601a0160cbba6fc6d4d02426c9437d Mon Sep 17 00:00:00 2001 From: Phil Calvin Date: Fri, 15 May 2026 14:14:21 +0000 Subject: [PATCH 02/11] Fix kernel reboot test to detect guest architecture The cloud kernel package name includes the arch suffix (linux-image-cloud-arm64 vs linux-image-cloud-amd64). Use dpkg --print-architecture inside the guest to pick the right one. Co-Authored-By: Claude Opus 4.6 --- tests/test_e2e.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 20c9929..36dc190 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -706,9 +706,15 @@ def test_kernel_install_and_reboot(running_vm): Placed last because it reboots the VM. """ - _progress("Installing cloud kernel flavor…") + # Detect guest architecture to pick the right cloud kernel package. + r = _vm_ssh("dpkg --print-architecture", timeout=10) + assert r.returncode == 0 + arch = r.stdout.strip() + cloud_pkg = f"linux-image-cloud-{arch}" + + _progress(f"Installing {cloud_pkg}…") r = _vm_ssh( - "bash -lc 'sudo apt-get install -y -qq linux-image-cloud-arm64 2>&1'", + f"bash -lc 'sudo apt-get install -y -qq {cloud_pkg} 2>&1'", timeout=300, ) assert r.returncode == 0, ( @@ -717,7 +723,7 @@ def test_kernel_install_and_reboot(running_vm): ) # Find the newly installed cloud kernel version. 
- r = _vm_ssh("ls /boot/vmlinuz-*-cloud-arm64", timeout=10) + r = _vm_ssh(f"ls /boot/vmlinuz-*-cloud-{arch}", timeout=10) assert r.returncode == 0, f"No cloud kernel found in /boot:\n{r.stderr}" cloud_vmlinuz = r.stdout.strip().splitlines()[-1].strip() cloud_version = cloud_vmlinuz.rsplit("/", 1)[-1].removeprefix("vmlinuz-") From 91b94d840f78fb132c8d830f8d3e1a62406bd2ca Mon Sep 17 00:00:00 2001 From: Phil Calvin Date: Thu, 21 May 2026 15:00:02 +0000 Subject: [PATCH 03/11] Speed up initramfs builds in VM (fixes CI timeout) Set MODULES=dep in initramfs-tools so only modules for detected hardware (virtio) are included, instead of hundreds of bare-metal drivers. Also bump the kernel install test timeout from 300s to 600s as a safety margin under TCG emulation. Co-Authored-By: Claude Opus 4.6 --- cloud-init/user-data | 7 +++++++ tests/test_e2e.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/cloud-init/user-data b/cloud-init/user-data index c159118..198c6ae 100644 --- a/cloud-init/user-data +++ b/cloud-init/user-data @@ -70,6 +70,13 @@ write_files: Environment="HTTPS_PROXY=http://__HOST_IP__:__PROXY_PORT__" Environment="NO_PROXY=localhost,127.0.0.1,__HOST_IP__" + # The default MODULES=most copies hundreds of bare-metal drivers into + # the initramfs. Under TCG emulation this takes minutes. MODULES=dep + # limits it to modules for detected hardware (virtio), cutting the + # rebuild from ~5 min to seconds. 
+ - path: /etc/initramfs-tools/conf.d/vm-modules-dep + content: | + MODULES=dep - path: /etc/systemd/system/mnt-9p.mount content: | [Unit] diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 36dc190..94fe39c 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -715,7 +715,7 @@ def test_kernel_install_and_reboot(running_vm): _progress(f"Installing {cloud_pkg}…") r = _vm_ssh( f"bash -lc 'sudo apt-get install -y -qq {cloud_pkg} 2>&1'", - timeout=300, + timeout=600, ) assert r.returncode == 0, ( f"Kernel install failed (rc={r.returncode}):\n" From 741390ffc6ee926665504b9bc1718109fa308700 Mon Sep 17 00:00:00 2001 From: Phil Calvin Date: Thu, 21 May 2026 15:00:05 +0000 Subject: [PATCH 04/11] Read GRUB root UUID from VM instead of hardcoding it The kernel reboot test hardcoded a partition UUID specific to the current Debian cloud image. Query it from the running VM via grub-probe so the test survives base image updates. Co-Authored-By: Claude Opus 4.6 --- tests/test_e2e.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 94fe39c..ee3debf 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -737,12 +737,19 @@ def test_kernel_install_and_reboot(running_vm): ) # Set GRUB to boot the cloud kernel by default. - grub_entry = f"gnulinux-advanced-e82711d0-3a02-4e17-9f90-2f275b0368c5>gnulinux-{cloud_version}-advanced-e82711d0-3a02-4e17-9f90-2f275b0368c5" + # Read the root filesystem UUID from the running VM rather than + # hardcoding a PARTUUID that is specific to one image build. + r = _vm_ssh( + "sudo grub-probe --target=fs_uuid /", + timeout=10, + ) + assert r.returncode == 0, f"Cannot determine root FS UUID:\n{r.stderr}" + root_uuid = r.stdout.strip() + grub_entry = f"gnulinux-advanced-{root_uuid}>gnulinux-{cloud_version}-advanced-{root_uuid}" _vm_ssh( f"sudo grub-set-default '{grub_entry}' 2>&1", timeout=10, ) - # Alternatively, just make sure it's the default (newest) entry. 
_vm_ssh("sudo update-grub 2>&1", timeout=60) # Verify GRUB config has an initrd line for the cloud kernel. From f70b2b6daa4456d95130b5babeaa8cedef5cdb9e Mon Sep 17 00:00:00 2001 From: Phil Calvin Date: Thu, 21 May 2026 20:29:56 +0000 Subject: [PATCH 05/11] Scope Docker allowlist to hello-world and add missing CDN rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous /v2/* wildcard allowed pulling any Docker image — too broad for a security-focused sandbox. Scope to library/hello-world (used by the e2e test) and add the bare /v2/ endpoint required for registry version checks. Also add CloudFront CDN (some Docker blob redirects land there instead of R2) and the uv installer redirect path on releases.astral.sh. Co-Authored-By: Claude Opus 4.6 --- allowlist.txt | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/allowlist.txt b/allowlist.txt index 731766d..5ef3743 100644 --- a/allowlist.txt +++ b/allowlist.txt @@ -59,20 +59,27 @@ GET https://api.anthropic.com/api/hello # astral.sh; binary downloads come from releases.astral.sh (or GitHub # release assets as a fallback). URLs vary by version and platform. GET https://astral.sh/uv/install.sh +GET https://releases.astral.sh/installers/uv/* GET https://releases.astral.sh/github/uv/releases/* GET https://github.com/astral-sh/uv/releases/* GET https://release-assets.githubusercontent.com/github-production-release-asset/* -# ── Docker Hub ──────────────────────────────────────────────────── -# Registry API — paths vary by image name, tag, and sha256 digest -# (e.g. /v2/library/hello-world/manifests/latest). Scoped to /v2/. -GET https://registry-1.docker.io/v2/* -# Auth tokens — the registry returns 401 with a token URL whose -# query parameters vary per request (scope, service, etc.). +# ── Docker Hub (hello-world only) ───────────────────────────────── +# Scoped to the library/hello-world image used by the e2e test. 
+# To pull other images, add their specific paths here. +# /v2/ (bare) is Docker's registry version check — required before +# any image pull. +GET https://registry-1.docker.io/v2/ +GET https://registry-1.docker.io/v2/library/hello-world/* +# Auth tokens — wildcard rule, not image-scoped; the registry rules above enforce the hello-world scope. GET https://auth.docker.io/token* -# Blob storage — the registry redirects layer downloads to this -# Cloudflare R2 bucket. Paths contain per-blob sha256 digests. +# Blob storage — the registry redirects layer downloads to either +# a Cloudflare R2 bucket or CloudFront CDN. Blob paths contain +# sha256 digests that can't be scoped per-image, but the registry +# only returns redirect URLs for layers belonging to images the +# client already resolved via the scoped manifest rules above. GET https://docker-images-prod.6aa30f8b08e16409b46e0173d6de2f56.r2.cloudflarestorage.com/registry-v2/* +GET https://production.cloudfront.docker.com/registry-v2/* # ── Debian cloud images (nested VM testing only) ────────────────── # Only needed when running the e2e test suite inside a VM (i.e. the From 67bef943ebd4ae0a784bd1a31123de315e027a42 Mon Sep 17 00:00:00 2001 From: Phil Calvin Date: Thu, 21 May 2026 20:30:02 +0000 Subject: [PATCH 06/11] Work around Claude Code install OOM in small VMs The official install script runs `claude install` after downloading the binary, which maps ~70 GB of virtual memory and gets OOM-killed in 512 MB VMs. Download the binary directly via curl and create the symlink ourselves, bypassing the problematic subcommand. Co-Authored-By: Claude Opus 4.6 --- cloud-init/user-data | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/cloud-init/user-data b/cloud-init/user-data index 198c6ae..75ad64d 100644 --- a/cloud-init/user-data +++ b/cloud-init/user-data @@ -221,8 +221,27 @@ runcmd: # from /etc/profile.d/proxy.sh). --no-modify-path because proxy.sh # already adds ~/.local/bin to PATH. Binary lands in /home/vm/.local/bin/. 
- su - vm -c 'curl -LsSf https://astral.sh/uv/install.sh | sh -s -- --no-modify-path' - # Install Claude Code CLI. - - su - vm -c 'curl -fsSL https://claude.ai/install.sh | bash' + # Install Claude Code CLI. The official install script downloads the + # binary then runs `claude install` to create the symlink — but that + # subcommand maps ~70 GB of virtual memory and gets OOM-killed in + # small VMs. Instead, download the binary directly and place it + # ourselves. + - | + su - vm -c ' + set -e + DOWNLOAD_BASE="https://downloads.claude.ai/claude-code-releases" + case "$(uname -m)" in + x86_64|amd64) platform="linux-x64" ;; + aarch64|arm64) platform="linux-arm64" ;; + *) echo "Unsupported arch: $(uname -m)" >&2; exit 1 ;; + esac + version=$(curl -fsSL "$DOWNLOAD_BASE/latest") + mkdir -p ~/.local/share/claude/versions ~/.local/bin + curl -fsSL -o ~/.local/share/claude/versions/"$version" \ + "$DOWNLOAD_BASE/$version/$platform/claude" + chmod +x ~/.local/share/claude/versions/"$version" + ln -sf ~/.local/share/claude/versions/"$version" ~/.local/bin/claude + ' # Propagate the host user's git identity into the VM so commits # made inside the guest have the correct author. The placeholders # are substituted by vm.py from `git config --global`; if the host From c2c00eb01324a89add9a67d4bebd53cb1f0637cd Mon Sep 17 00:00:00 2001 From: Phil Calvin Date: Thu, 21 May 2026 20:30:07 +0000 Subject: [PATCH 07/11] Add e2e tests for uv and Claude Code installation Verify that both tools are installed and on PATH after cloud-init provisioning. Also update CLAUDE.md to note that tests run inside this VM. 
Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 5 +++-- tests/test_e2e.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index dce0e54..c849c9b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,8 +4,9 @@ ## Development workflow -Before running the e2e tests, ensure the test prerequisites from -HACKING.md are installed. +The e2e tests run inside this VM. If `uv` is not yet installed, install +it first (see Tooling policy below). Ensure the test prerequisites from +HACKING.md are also installed. Always run the test suite before committing: diff --git a/tests/test_e2e.py b/tests/test_e2e.py index ee3debf..163fbda 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -332,6 +332,40 @@ def test_docker_hello_world(running_vm): ) +def test_uv_installed(running_vm): + """uv should be installed and functional after cloud-init provisioning.""" + _progress("Checking uv installation…") + r = _vm_ssh("bash -lc 'uv --version'", timeout=30) + assert r.returncode == 0, ( + f"uv not installed or not on PATH (rc={r.returncode}):\n" + f"stdout: {r.stdout[:500]}\nstderr: {r.stderr[:500]}" + ) + assert "uv" in r.stdout, f"Unexpected uv --version output: {r.stdout}" + + +def test_claude_code_installed(running_vm): + """Claude Code CLI should be installed and functional after cloud-init provisioning.""" + _progress("Checking Claude Code installation…") + r = _vm_ssh("bash -lc 'claude --version'", timeout=30) + if r.returncode != 0: + diag = _vm_ssh( + "bash -lc 'ls -la ~/.local/bin/claude 2>&1; " + "ls ~/.local/share/claude/versions/ 2>&1; " + "echo PATH=$PATH'", + timeout=10, + ) + assert False, ( + f"claude not installed or not on PATH (rc={r.returncode}):\n" + f"stderr: {r.stderr[:500]}\n" + f"diagnostics:\n{diag.stdout[:1000]}" + ) + output = (r.stdout + r.stderr).lower() + assert "claude" in output, ( + f"Unexpected claude --version output:\n" + f"stdout: {r.stdout!r}\nstderr: {r.stderr!r}" + ) + + def 
test_blocked_domain(running_vm): """Requests to domains not in filter.py's allowlist should be blocked with 403.""" result = _vm_ssh( From 416131769d1f146dfc26bf75bc87600a48fba1bc Mon Sep 17 00:00:00 2001 From: Phil Calvin Date: Thu, 21 May 2026 22:07:03 +0000 Subject: [PATCH 08/11] Revert manual Claude Code install, use official installer The manual binary download was a workaround for OOM in 512M VMs. With the test VM bumped to 2G (next commit), the official installer works correctly. This reverts to `curl | bash` which handles version detection, binary placement, and symlink creation. This reverts the cloud-init portion of 67bef94. Co-Authored-By: Claude Opus 4.6 --- cloud-init/user-data | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/cloud-init/user-data b/cloud-init/user-data index 75ad64d..198c6ae 100644 --- a/cloud-init/user-data +++ b/cloud-init/user-data @@ -221,27 +221,8 @@ runcmd: # from /etc/profile.d/proxy.sh). --no-modify-path because proxy.sh # already adds ~/.local/bin to PATH. Binary lands in /home/vm/.local/bin/. - su - vm -c 'curl -LsSf https://astral.sh/uv/install.sh | sh -s -- --no-modify-path' - # Install Claude Code CLI. The official install script downloads the - # binary then runs `claude install` to create the symlink — but that - # subcommand maps ~70 GB of virtual memory and gets OOM-killed in - # small VMs. Instead, download the binary directly and place it - # ourselves. 
- - | - su - vm -c ' - set -e - DOWNLOAD_BASE="https://downloads.claude.ai/claude-code-releases" - case "$(uname -m)" in - x86_64|amd64) platform="linux-x64" ;; - aarch64|arm64) platform="linux-arm64" ;; - *) echo "Unsupported arch: $(uname -m)" >&2; exit 1 ;; - esac - version=$(curl -fsSL "$DOWNLOAD_BASE/latest") - mkdir -p ~/.local/share/claude/versions ~/.local/bin - curl -fsSL -o ~/.local/share/claude/versions/"$version" \ - "$DOWNLOAD_BASE/$version/$platform/claude" - chmod +x ~/.local/share/claude/versions/"$version" - ln -sf ~/.local/share/claude/versions/"$version" ~/.local/bin/claude - ' + # Install Claude Code CLI. + - su - vm -c 'curl -fsSL https://claude.ai/install.sh | bash' # Propagate the host user's git identity into the VM so commits # made inside the guest have the correct author. The placeholders # are substituted by vm.py from `git config --global`; if the host From ea467c4e36b82e23b8f1d5063e1fb4786bb94698 Mon Sep 17 00:00:00 2001 From: Phil Calvin Date: Thu, 21 May 2026 22:07:10 +0000 Subject: [PATCH 09/11] Give test VM 2G RAM and improve Claude Code test diagnostics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 512M was not enough for the Claude Code binary — it maps ~70GB of virtual address space on startup and fails silently (rc=255) when the kernel denies the allocation. Bump to 2G so the official installer and runtime both work. Also collect file type, shared library, dmesg, and memory info when claude --version fails, so future failures are diagnosable without guesswork. Co-Authored-By: Claude Opus 4.6 --- tests/test_e2e.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 163fbda..a96814f 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -181,7 +181,7 @@ def running_vm(): # vm.py start runs mitmproxy in the background and QEMU in the foreground. 
# Both inherit our file handles, so their output lands in console.log. vm_proc = subprocess.Popen( - [sys.executable, str(VM_PY), "start", "--memory", "512M", + [sys.executable, str(VM_PY), "start", "--memory", "2G", "--ssh-port", str(TEST_SSH_PORT), "--proxy-port", str(TEST_PROXY_PORT), "--extra-user-data", str(REPO / "tests" / "nmap.yaml")], @@ -349,15 +349,20 @@ def test_claude_code_installed(running_vm): r = _vm_ssh("bash -lc 'claude --version'", timeout=30) if r.returncode != 0: diag = _vm_ssh( - "bash -lc 'ls -la ~/.local/bin/claude 2>&1; " - "ls ~/.local/share/claude/versions/ 2>&1; " - "echo PATH=$PATH'", + "bash -lc '" + "echo \"=== binary ===\"; ls -la ~/.local/bin/claude 2>&1; " + "echo \"=== versions ===\"; ls ~/.local/share/claude/versions/ 2>&1; " + "echo \"=== file ===\"; file $(readlink -f ~/.local/bin/claude) 2>&1; " + "echo \"=== ldd ===\"; ldd $(readlink -f ~/.local/bin/claude) 2>&1; " + "echo \"=== dmesg ===\"; sudo dmesg | tail -20 2>&1; " + "echo \"=== free ===\"; free -h 2>&1; " + "echo \"=== PATH ===\"; echo PATH=$PATH'", timeout=10, ) assert False, ( f"claude not installed or not on PATH (rc={r.returncode}):\n" f"stderr: {r.stderr[:500]}\n" - f"diagnostics:\n{diag.stdout[:1000]}" + f"diagnostics:\n{diag.stdout[:2000]}" ) output = (r.stdout + r.stderr).lower() assert "claude" in output, ( From dbe2018ab1917557682ab2bdebac848485561893 Mon Sep 17 00:00:00 2001 From: Phil Calvin Date: Fri, 22 May 2026 15:06:07 +0000 Subject: [PATCH 10/11] Use cpu=max under TCG and bypass claude install MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes for the Claude Code install failure on Linux CI: 1. Change the TCG CPU model from qemu64 to max. The Claude Code x86_64 binary uses instructions that qemu64 doesn't emulate, causing an invalid-opcode trap (visible in dmesg). 2. Download the binary directly instead of using the official installer. 
The installer runs `claude install` which maps ~70 GB of virtual memory — even with cpu=max, this takes so long under TCG that cloud-init times out. Validated locally by reproducing the CI failure: - TCG + qemu64 + official installer → invalid opcode (FAIL) - TCG + max + official installer → cloud-init timeout (FAIL) - TCG + max + direct download → PASS Co-Authored-By: Claude Opus 4.6 --- cloud-init/user-data | 23 +++++++++++++++++++++-- vm.py | 4 ++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/cloud-init/user-data b/cloud-init/user-data index 198c6ae..8d95e3c 100644 --- a/cloud-init/user-data +++ b/cloud-init/user-data @@ -221,8 +221,27 @@ runcmd: # from /etc/profile.d/proxy.sh). --no-modify-path because proxy.sh # already adds ~/.local/bin to PATH. Binary lands in /home/vm/.local/bin/. - su - vm -c 'curl -LsSf https://astral.sh/uv/install.sh | sh -s -- --no-modify-path' - # Install Claude Code CLI. - - su - vm -c 'curl -fsSL https://claude.ai/install.sh | bash' + # Install Claude Code CLI. The official install script runs + # `claude install` after downloading, which maps ~70 GB of virtual + # memory. Under TCG emulation this either triggers an invalid-opcode + # trap (qemu64 lacks the required instructions) or takes so long that + # cloud-init times out. Download the binary directly instead. 
+ - | + su - vm -c ' + set -e + DOWNLOAD_BASE="https://downloads.claude.ai/claude-code-releases" + case "$(uname -m)" in + x86_64|amd64) platform="linux-x64" ;; + aarch64|arm64) platform="linux-arm64" ;; + *) echo "Unsupported arch: $(uname -m)" >&2; exit 1 ;; + esac + version=$(curl -fsSL "$DOWNLOAD_BASE/latest") + mkdir -p ~/.local/share/claude/versions ~/.local/bin + curl -fsSL -o ~/.local/share/claude/versions/"$version" \ + "$DOWNLOAD_BASE/$version/$platform/claude" + chmod +x ~/.local/share/claude/versions/"$version" + ln -sf ~/.local/share/claude/versions/"$version" ~/.local/bin/claude + ' # Propagate the host user's git identity into the VM so commits # made inside the guest have the correct author. The placeholders # are substituted by vm.py from `git config --global`; if the host diff --git a/vm.py b/vm.py index 50d356a..1d2c930 100755 --- a/vm.py +++ b/vm.py @@ -166,7 +166,7 @@ def machine_args(self) -> list[str]: if self.arch == Arch.ARM64: cpu = "host" if self._accel == "hvf" else "cortex-a57" return ["-machine", f"virt,accel={self._accel}", "-cpu", cpu] - cpu = "host" if self._accel == "hvf" else "qemu64" + cpu = "host" if self._accel == "hvf" else "max" return ["-machine", f"q35,accel={self._accel}", "-cpu", cpu] def prepare_efi(self, state_dir: Path) -> tuple[Path, Path]: @@ -200,7 +200,7 @@ def machine_args(self) -> list[str]: if self.arch == Arch.ARM64: cpu = "host" if self._accel == "kvm" else "cortex-a57" return ["-machine", f"virt,accel={self._accel}", "-cpu", cpu] - cpu = "host" if self._accel == "kvm" else "qemu64" + cpu = "host" if self._accel == "kvm" else "max" return ["-machine", f"q35,accel={self._accel}", "-cpu", cpu] def prepare_efi(self, state_dir: Path) -> tuple[Path, Path]: From 5c35788be1c50523d4a0331b8ce617e65452d1c2 Mon Sep 17 00:00:00 2001 From: Phil Calvin Date: Fri, 22 May 2026 15:06:12 +0000 Subject: [PATCH 11/11] Document CI failure reproduction workflow in CLAUDE.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Add a section explaining how to reproduce CI failures locally by forcing TCG mode with QEMU_ACCEL=tcg, and the expected reproduce → fix → verify → full suite workflow. Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index c849c9b..6f736f8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -16,6 +16,24 @@ uv run pytest tests/test_e2e.py -v -s The test boots the VM end-to-end (takes ~90s without KVM) and verifies `curl https://pypi.org` works through mitmproxy. Do not commit if this fails. +### Fixing CI failures + +When a test fails in CI but passes locally, **reproduce the failure locally +before applying a fix.** This VM has KVM, but CI may not — one known +divergence is the QEMU CPU model (`-cpu host` with KVM vs `-cpu max` with +TCG). To match CI's TCG environment: + +```bash +QEMU_ACCEL=tcg uv run pytest tests/test_e2e.py::test_that_failed -v -s +``` + +The workflow is: + +1. **Reproduce** — run the failing test under CI-like conditions and confirm it fails. +2. **Fix** — apply the change. +3. **Verify** — re-run under the same conditions and confirm it passes. +4. **Full suite** — run the complete test suite to check for regressions. + The full suite including the network isolation tests can take 5+ minutes under TCG emulation. TCG is slower than KVM but not *that* slow — if cloud-init status is unchanged for more than a minute, check the console log and process list rather than assuming it's just slow. A dead QEMU process or OOM kill is more likely than TCG being the bottleneck. Launch the test with `Bash` using `run_in_background: true`, then immediately attach a `Monitor` to tail the output file with a progress filter. This keeps the conversation unblocked while streaming results: