diff --git a/CLAUDE.md b/CLAUDE.md index dce0e54..6f736f8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,8 +4,9 @@ ## Development workflow -Before running the e2e tests, ensure the test prerequisites from -HACKING.md are installed. +The e2e tests run inside this VM. If `uv` is not yet installed, install +it first (see Tooling policy below). Ensure the test prerequisites from +HACKING.md are also installed. Always run the test suite before committing: @@ -15,6 +16,24 @@ uv run pytest tests/test_e2e.py -v -s The test boots the VM end-to-end (takes ~90s without KVM) and verifies `curl https://pypi.org` works through mitmproxy. Do not commit if this fails. +### Fixing CI failures + +When a test fails in CI but passes locally, **reproduce the failure locally +before applying a fix.** This VM has KVM, but CI may not — one known +divergence is the QEMU CPU model (`-cpu host` with KVM vs `-cpu max` with +TCG). To match CI's TCG environment: + +```bash +QEMU_ACCEL=tcg uv run pytest tests/test_e2e.py::test_that_failed -v -s +``` + +The workflow is: + +1. **Reproduce** — run the failing test under CI-like conditions and confirm it fails. +2. **Fix** — apply the change. +3. **Verify** — re-run under the same conditions and confirm it passes. +4. **Full suite** — run the complete test suite to check for regressions. + The full suite including the network isolation tests can take 5+ minutes under TCG emulation. TCG is slower than KVM but not *that* slow — if cloud-init status is unchanged for more than a minute, check the console log and process list rather than assuming it's just slow. A dead QEMU process or OOM kill is more likely than TCG being the bottleneck. Launch the test with `Bash` using `run_in_background: true`, then immediately attach a `Monitor` to tail the output file with a progress filter. 
This keeps the conversation unblocked while streaming results: diff --git a/allowlist.txt b/allowlist.txt index 731766d..5ef3743 100644 --- a/allowlist.txt +++ b/allowlist.txt @@ -59,20 +59,27 @@ GET https://api.anthropic.com/api/hello # astral.sh; binary downloads come from releases.astral.sh (or GitHub # release assets as a fallback). URLs vary by version and platform. GET https://astral.sh/uv/install.sh +GET https://releases.astral.sh/installers/uv/* GET https://releases.astral.sh/github/uv/releases/* GET https://github.com/astral-sh/uv/releases/* GET https://release-assets.githubusercontent.com/github-production-release-asset/* -# ── Docker Hub ──────────────────────────────────────────────────── -# Registry API — paths vary by image name, tag, and sha256 digest -# (e.g. /v2/library/hello-world/manifests/latest). Scoped to /v2/. -GET https://registry-1.docker.io/v2/* -# Auth tokens — the registry returns 401 with a token URL whose -# query parameters vary per request (scope, service, etc.). +# ── Docker Hub (hello-world only) ───────────────────────────────── +# Scoped to the library/hello-world image used by the e2e test. +# To pull other images, add their specific paths here. +# /v2/ (bare) is Docker's registry version check — required before +# any image pull. +GET https://registry-1.docker.io/v2/ +GET https://registry-1.docker.io/v2/library/hello-world/* +# Auth tokens — params vary per request; not scoped per repository. GET https://auth.docker.io/token* -# Blob storage — the registry redirects layer downloads to this -# Cloudflare R2 bucket. Paths contain per-blob sha256 digests. +# Blob storage — the registry redirects layer downloads to either +# a Cloudflare R2 bucket or CloudFront CDN. Blob paths contain +# sha256 digests that can't be scoped per-image, but the registry +# only returns redirect URLs for layers belonging to images the +# client already resolved via the scoped manifest rules above. 
GET https://docker-images-prod.6aa30f8b08e16409b46e0173d6de2f56.r2.cloudflarestorage.com/registry-v2/* +GET https://production.cloudfront.docker.com/registry-v2/* # ── Debian cloud images (nested VM testing only) ────────────────── # Only needed when running the e2e test suite inside a VM (i.e. the diff --git a/cloud-init/user-data b/cloud-init/user-data index 044d6dd..8d95e3c 100644 --- a/cloud-init/user-data +++ b/cloud-init/user-data @@ -21,12 +21,18 @@ bootcmd: wget -qO /usr/local/share/ca-certificates/mitmproxy.crt http://mitm.it/cert/pem update-ca-certificates fi - # Disable initramfs rebuilds — this is an ephemeral VM that is never - # rebooted. The generic kernel's mkinitramfs takes 2+ minutes under - # TCG because it copies hundreds of driver modules. - - dpkg-divert --local --rename --add /usr/sbin/update-initramfs - - ln -sf /bin/true /usr/sbin/update-initramfs - + # Suppress initramfs rebuilds during first-boot provisioning only. + # Package installs (docker.io, etc.) trigger the initramfs-tools dpkg + # hook, and a full rebuild takes minutes under TCG emulation. The + # diversion is undone in runcmd so that future kernel upgrades (via + # unattended-upgrades) generate a working initramfs. + # Guard: boot-finished is written at the very end of cloud-init's final + # stage, so it won't exist during first boot but will on every later boot. + - | + if [ ! -f /var/lib/cloud/instance/boot-finished ]; then + dpkg-divert --local --rename --add /usr/sbin/update-initramfs 2>/dev/null + ln -sf /bin/true /usr/sbin/update-initramfs + fi write_files: - path: /etc/apt/apt.conf.d/90proxy content: | @@ -64,6 +70,13 @@ write_files: Environment="HTTPS_PROXY=http://__HOST_IP__:__PROXY_PORT__" Environment="NO_PROXY=localhost,127.0.0.1,__HOST_IP__" + # The default MODULES=most copies hundreds of bare-metal drivers into + # the initramfs. Under TCG emulation this takes minutes. 
MODULES=dep + # limits it to modules for detected hardware (virtio), cutting the + # rebuild from ~5 min to seconds. + - path: /etc/initramfs-tools/conf.d/vm-modules-dep + content: | + MODULES=dep - path: /etc/systemd/system/mnt-9p.mount content: | [Unit] @@ -184,6 +197,14 @@ packages: - docker.io runcmd: + # Undo the update-initramfs diversion applied during first-boot + # provisioning (see bootcmd). From this point on, kernel upgrades + # will generate a proper initramfs. + - | + if dpkg-divert --list /usr/sbin/update-initramfs 2>/dev/null | grep -q diversion; then + rm -f /usr/sbin/update-initramfs + dpkg-divert --local --rename --remove /usr/sbin/update-initramfs + fi - mkdir -p /mnt/9p /home/vm/shared - systemctl daemon-reload - systemctl enable --now mnt-9p.mount @@ -200,8 +221,27 @@ runcmd: # from /etc/profile.d/proxy.sh). --no-modify-path because proxy.sh # already adds ~/.local/bin to PATH. Binary lands in /home/vm/.local/bin/. - su - vm -c 'curl -LsSf https://astral.sh/uv/install.sh | sh -s -- --no-modify-path' - # Install Claude Code CLI. - - su - vm -c 'curl -fsSL https://claude.ai/install.sh | bash' + # Install Claude Code CLI. The official install script runs + # `claude install` after downloading, which maps ~70 GB of virtual + # memory. Under TCG emulation this either triggers an invalid-opcode + # trap (qemu64 lacks the required instructions) or takes so long that + # cloud-init times out. Download the binary directly instead. 
+ - | + su - vm -c ' + set -e + DOWNLOAD_BASE="https://downloads.claude.ai/claude-code-releases" + case "$(uname -m)" in + x86_64|amd64) platform="linux-x64" ;; + aarch64|arm64) platform="linux-arm64" ;; + *) echo "Unsupported arch: $(uname -m)" >&2; exit 1 ;; + esac + version=$(curl -fsSL "$DOWNLOAD_BASE/latest") + mkdir -p ~/.local/share/claude/versions ~/.local/bin + curl -fsSL -o ~/.local/share/claude/versions/"$version" \ + "$DOWNLOAD_BASE/$version/$platform/claude" + chmod +x ~/.local/share/claude/versions/"$version" + ln -sf ~/.local/share/claude/versions/"$version" ~/.local/bin/claude + ' # Propagate the host user's git identity into the VM so commits # made inside the guest have the correct author. The placeholders # are substituted by vm.py from `git config --global`; if the host diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 3216c92..a96814f 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -181,7 +181,7 @@ def running_vm(): # vm.py start runs mitmproxy in the background and QEMU in the foreground. # Both inherit our file handles, so their output lands in console.log. 
vm_proc = subprocess.Popen( - [sys.executable, str(VM_PY), "start", "--memory", "512M", + [sys.executable, str(VM_PY), "start", "--memory", "2G", "--ssh-port", str(TEST_SSH_PORT), "--proxy-port", str(TEST_PROXY_PORT), "--extra-user-data", str(REPO / "tests" / "nmap.yaml")], @@ -332,6 +332,45 @@ def test_docker_hello_world(running_vm): ) +def test_uv_installed(running_vm): + """uv should be installed and functional after cloud-init provisioning.""" + _progress("Checking uv installation…") + r = _vm_ssh("bash -lc 'uv --version'", timeout=30) + assert r.returncode == 0, ( + f"uv not installed or not on PATH (rc={r.returncode}):\n" + f"stdout: {r.stdout[:500]}\nstderr: {r.stderr[:500]}" + ) + assert "uv" in r.stdout, f"Unexpected uv --version output: {r.stdout}" + + +def test_claude_code_installed(running_vm): + """Claude Code CLI should be installed and functional after cloud-init provisioning.""" + _progress("Checking Claude Code installation…") + r = _vm_ssh("bash -lc 'claude --version'", timeout=30) + if r.returncode != 0: + diag = _vm_ssh( + "bash -lc '" + "echo \"=== binary ===\"; ls -la ~/.local/bin/claude 2>&1; " + "echo \"=== versions ===\"; ls ~/.local/share/claude/versions/ 2>&1; " + "echo \"=== file ===\"; file $(readlink -f ~/.local/bin/claude) 2>&1; " + "echo \"=== ldd ===\"; ldd $(readlink -f ~/.local/bin/claude) 2>&1; " + "echo \"=== dmesg ===\"; sudo dmesg 2>&1 | tail -20; " + "echo \"=== free ===\"; free -h 2>&1; " + "echo \"=== PATH ===\"; echo PATH=$PATH'", + timeout=10, + ) + pytest.fail( + f"claude not installed or not on PATH (rc={r.returncode}):\n" + f"stderr: {r.stderr[:500]}\n" + f"diagnostics:\n{diag.stdout[:2000]}" + ) + output = (r.stdout + r.stderr).lower() + assert "claude" in output, ( + f"Unexpected claude --version output:\n" + f"stdout: {r.stdout!r}\nstderr: {r.stderr!r}" + ) + + def test_blocked_domain(running_vm): """Requests to domains not in filter.py's allowlist should be blocked with 403.""" result = _vm_ssh( @@ -676,3 
+715,125 @@ def test_guest_cannot_modify_host_allowlist(running_vm): _vm_ssh(f"rm -f ~/shared/{marker} 2>/dev/null; true", timeout=10) # Safety net: restore original content in case the test failed allowlist_path.write_text(original_content) + + +# --------------------------------------------------------------------------- +# Kernel upgrade + reboot +# --------------------------------------------------------------------------- + + +def test_kernel_install_and_reboot(running_vm): + """Installing a new kernel and rebooting must not kernel panic. + + The base cloud-init config once diverted update-initramfs to /bin/true + to speed up provisioning (~2 min saved under TCG emulation). This was + safe under the assumption that the VM was ephemeral and never rebooted. + In practice, Debian's unattended-upgrades installs kernel security + updates on a daily timer. Because update-initramfs was a no-op, the + new kernel shipped without an initramfs. GRUB's update-grub still picked + up the new vmlinuz and made it the default boot entry — but with no + initrd line. On next boot the kernel couldn't load the virtio_blk + module (it lives in the initramfs, not built-in), so the root disk was + invisible and the kernel panicked: + + VFS: Cannot open root device "PARTUUID=..." or unknown-block(0,0) + Kernel panic - not syncing: VFS: Unable to mount root fs + + This test reproduces that scenario end-to-end: install a second kernel + flavor, set GRUB to boot it, and reboot. If update-initramfs is broken, + the VM kernel-panics and SSH never comes back. + + Placed last because it reboots the VM. + """ + # Detect guest architecture to pick the right cloud kernel package. 
+ r = _vm_ssh("dpkg --print-architecture", timeout=10) + assert r.returncode == 0 + arch = r.stdout.strip() + cloud_pkg = f"linux-image-cloud-{arch}" + + _progress(f"Installing {cloud_pkg}…") + r = _vm_ssh( + f"bash -lc 'sudo apt-get install -y -qq {cloud_pkg} 2>&1'", + timeout=600, + ) + assert r.returncode == 0, ( + f"Kernel install failed (rc={r.returncode}):\n" + f"{r.stdout[-2000:]}\n{r.stderr[-2000:]}" + ) + + # Find the newly installed cloud kernel version. + r = _vm_ssh(f"ls /boot/vmlinuz-*-cloud-{arch}", timeout=10) + assert r.returncode == 0, f"No cloud kernel found in /boot:\n{r.stderr}" + cloud_vmlinuz = r.stdout.strip().splitlines()[-1].strip() + cloud_version = cloud_vmlinuz.rsplit("/", 1)[-1].removeprefix("vmlinuz-") + _progress(f"Installed cloud kernel: {cloud_version}") + + # Verify the initramfs was created for it. + r = _vm_ssh(f"test -f /boot/initrd.img-{cloud_version}", timeout=10) + assert r.returncode == 0, ( + f"initrd.img-{cloud_version} was not created.\n" + "update-initramfs is likely diverted to /bin/true." + ) + + # Set GRUB to boot the cloud kernel by default. + # Read the root filesystem UUID from the running VM rather than + # hardcoding a PARTUUID that is specific to one image build. + r = _vm_ssh( + "sudo grub-probe --target=fs_uuid /", + timeout=10, + ) + assert r.returncode == 0, f"Cannot determine root FS UUID:\n{r.stderr}" + root_uuid = r.stdout.strip() + grub_entry = f"gnulinux-advanced-{root_uuid}>gnulinux-{cloud_version}-advanced-{root_uuid}" + _vm_ssh( + f"sudo grub-set-default '{grub_entry}' 2>&1", + timeout=10, + ) + _vm_ssh("sudo update-grub 2>&1", timeout=60) + + # Verify GRUB config has an initrd line for the cloud kernel. + r = _vm_ssh("cat /boot/grub/grub.cfg", timeout=10) + assert f"initrd\t/boot/initrd.img-{cloud_version}" in r.stdout, ( + f"GRUB config missing initrd for {cloud_version}." + ) + + _progress("Rebooting into cloud kernel…") + _vm_ssh("sudo reboot", timeout=10) + + # Wait for SSH to go down. 
+ time.sleep(10) + + # Wait for SSH to come back — if the kernel panicked, it never will. + deadline = time.monotonic() + BOOT_TIMEOUT + attempt = 0 + while time.monotonic() < deadline: + if running_vm.poll() is not None: + _dump_logs() + pytest.fail( + "QEMU exited during reboot — likely kernel panic.\n" + "Check console log above." + ) + attempt += 1 + remaining = int(deadline - time.monotonic()) + _progress(f"Post-reboot SSH probe #{attempt} ({remaining}s remaining)…") + try: + r = _vm_ssh("true", timeout=10) + if r.returncode == 0: + _progress(f"VM back after reboot ({attempt} probe(s))") + break + except subprocess.TimeoutExpired: + pass + time.sleep(SSH_POLL_INTERVAL) + else: + _dump_logs() + pytest.fail( + f"VM did not come back after reboot within {BOOT_TIMEOUT}s.\n" + "Likely kernel panic due to missing initramfs." + ) + + # Confirm we're running the new kernel. + r = _vm_ssh("uname -r", timeout=10) + _progress(f"Running kernel after reboot: {r.stdout.strip()}") + assert "cloud" in r.stdout, ( + f"Expected to boot cloud kernel, got: {r.stdout.strip()}" + ) diff --git a/vm.py b/vm.py index 50d356a..1d2c930 100755 --- a/vm.py +++ b/vm.py @@ -166,7 +166,7 @@ def machine_args(self) -> list[str]: if self.arch == Arch.ARM64: cpu = "host" if self._accel == "hvf" else "cortex-a57" return ["-machine", f"virt,accel={self._accel}", "-cpu", cpu] - cpu = "host" if self._accel == "hvf" else "qemu64" + cpu = "host" if self._accel == "hvf" else "max" return ["-machine", f"q35,accel={self._accel}", "-cpu", cpu] def prepare_efi(self, state_dir: Path) -> tuple[Path, Path]: @@ -200,7 +200,7 @@ def machine_args(self) -> list[str]: if self.arch == Arch.ARM64: cpu = "host" if self._accel == "kvm" else "cortex-a57" return ["-machine", f"virt,accel={self._accel}", "-cpu", cpu] - cpu = "host" if self._accel == "kvm" else "qemu64" + cpu = "host" if self._accel == "kvm" else "max" return ["-machine", f"q35,accel={self._accel}", "-cpu", cpu] def prepare_efi(self, state_dir: Path) -> 
tuple[Path, Path]: