diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 408ef85c7..16158c0dc 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -63,6 +63,7 @@ Use gateway metadata, deployment values, or the user's setup notes to identify t docker info docker ps --filter name=openshell docker logs --tail=200 +docker run --rm --entrypoint /openshell-sandbox "${OPENSHELL_DOCKER_SUPERVISOR_IMAGE:-ghcr.io/nvidia/openshell/supervisor:latest}" --version openshell status ``` @@ -71,6 +72,7 @@ Common findings: - Docker daemon unavailable: start Docker Desktop or Docker Engine. - Gateway process stopped: inspect exit status and logs. - Sandbox image missing or pull denied: verify image reference and registry credentials. +- Docker driver cannot initialize because it cannot find `openshell-sandbox`: verify `OPENSHELL_DOCKER_SUPERVISOR_BIN`, the sibling binary next to `openshell-gateway`, or the configured supervisor image contains `/openshell-sandbox`. - Sandbox never registers: check gateway logs and supervisor callback endpoint. 
For source checkout development, restart the local gateway with: diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 6c3807858..42d991b60 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -155,6 +155,7 @@ jobs: component: ${{ needs.resolve.outputs.binary_component }} arch: ${{ matrix.arch }} cargo-version: ${{ inputs['cargo-version'] }} + image-tag: ${{ needs.resolve.outputs.image_tag_base }} checkout-ref: ${{ inputs['checkout-ref'] }} features: openshell-core/dev-settings artifact-name: ${{ needs.resolve.outputs.artifact_prefix }}-linux-${{ matrix.arch }} @@ -238,7 +239,6 @@ jobs: --cache-to "type=gha,mode=max,scope=${{ inputs.component }}-${{ matrix.arch }}" - name: Smoke check ${{ inputs.component }} image - if: ${{ !inputs.push }} run: | set -euo pipefail image="${IMAGE_REGISTRY}/${{ inputs.component }}:${IMAGE_TAG}" @@ -249,7 +249,7 @@ jobs: grep -q '^openshell-gateway ' <<<"$output" ;; supervisor) - output="$(docker run --rm --platform "${{ matrix.platform }}" "$image" --version)" + output="$(docker run --rm --platform "${{ matrix.platform }}" --entrypoint /openshell-sandbox "$image" --version)" echo "$output" grep -q '^openshell-sandbox ' <<<"$output" ;; diff --git a/.github/workflows/release-dev.yml b/.github/workflows/release-dev.yml index 0385930bd..7b9bb2f92 100644 --- a/.github/workflows/release-dev.yml +++ b/.github/workflows/release-dev.yml @@ -432,6 +432,8 @@ jobs: sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${{ needs.compute-versions.outputs.cargo_version }}"'"/}' Cargo.toml - name: Build ${{ matrix.target }} + env: + OPENSHELL_IMAGE_TAG: ${{ github.sha }} run: | set -euo pipefail mise x -- cargo build --release --target ${{ matrix.target }} -p openshell-server diff --git a/.github/workflows/release-tag.yml b/.github/workflows/release-tag.yml index 97c8422a2..60966b3b6 100644 --- a/.github/workflows/release-tag.yml +++ 
b/.github/workflows/release-tag.yml @@ -466,6 +466,8 @@ jobs: sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${{ needs.compute-versions.outputs.cargo_version }}"'"/}' Cargo.toml - name: Build ${{ matrix.target }} + env: + OPENSHELL_IMAGE_TAG: ${{ needs.compute-versions.outputs.source_sha }} run: | set -euo pipefail mise x -- cargo build --release --target ${{ matrix.target }} -p openshell-server diff --git a/.github/workflows/shadow-rust-native-build.yml b/.github/workflows/shadow-rust-native-build.yml index b943a1ddb..7113d260f 100644 --- a/.github/workflows/shadow-rust-native-build.yml +++ b/.github/workflows/shadow-rust-native-build.yml @@ -42,6 +42,11 @@ on: required: false type: string default: "" + image-tag: + description: "Supervisor image tag to bake into gateway binaries" + required: false + type: string + default: "" workflow_dispatch: inputs: component: @@ -85,6 +90,11 @@ on: required: false type: string default: "" + image-tag: + description: "Supervisor image tag to bake into gateway binaries" + required: false + type: string + default: "" permissions: contents: read @@ -207,6 +217,7 @@ jobs: # Preserve the release-codegen setting used by the old Dockerfile # Rust build path so image artifacts keep the same release profile. 
CARGO_PROFILE_RELEASE_CODEGEN_UNITS: "1" + OPENSHELL_IMAGE_TAG: ${{ inputs['image-tag'] }} run: | set -euo pipefail args=( diff --git a/architecture/build.md b/architecture/build.md index baf44eba9..266575efb 100644 --- a/architecture/build.md +++ b/architecture/build.md @@ -12,7 +12,7 @@ OpenShell builds these main artifacts: |---|---| | Gateway binary | `crates/openshell-server` | | CLI package and Python SDK | `python/openshell` plus Rust binaries where packaged | -| Gateway container image | `deploy/docker/Dockerfile.images` | +| Gateway and supervisor container images | `deploy/docker/Dockerfile.images` | | Helm chart | `deploy/helm/openshell` | | VM driver/runtime assets | `crates/openshell-driver-vm` | | Published docs site | `docs/` rendered by Fern config in `fern/` | @@ -25,6 +25,8 @@ The Docker image pipeline stages prebuilt Rust binaries, then builds container images from `deploy/docker/Dockerfile.images`. CI builds native artifacts on the target architecture, stages them under `deploy/docker/.build/`, and then uses Buildx to publish per-architecture images and multi-architecture tags. +Gateway image builds bake the corresponding supervisor image tag into the +gateway binary so Docker sandboxes do not depend on `:latest` by default. Local image work should use `mise` tasks rather than direct Docker commands so the same staging and tagging assumptions are used locally and in CI. diff --git a/architecture/compute-runtimes.md b/architecture/compute-runtimes.md index 095b7d020..33917a28f 100644 --- a/architecture/compute-runtimes.md +++ b/architecture/compute-runtimes.md @@ -38,7 +38,7 @@ The supervisor must be available inside each sandbox workload: | Runtime | Delivery model | |---|---| -| Docker | Bind-mounted or extracted supervisor binary configured by the gateway. | +| Docker | Bind-mounted local supervisor binary, or a binary extracted from the configured supervisor image. | | Podman | Read-only OCI image volume containing the supervisor binary. 
| | Kubernetes | Sandbox pod image or pod template configuration. | | VM | Embedded in the guest rootfs bundle. | diff --git a/crates/openshell-driver-docker/README.md b/crates/openshell-driver-docker/README.md index 7bc8048b2..99b6e1385 100644 --- a/crates/openshell-driver-docker/README.md +++ b/crates/openshell-driver-docker/README.md @@ -34,6 +34,21 @@ contract: The agent child process does not retain these supervisor privileges. +## Supervisor Binary Resolution + +The Docker driver bind-mounts a host-side Linux `openshell-sandbox` binary into +each sandbox container. Resolution order is: + +1. `--docker-supervisor-bin` / `OPENSHELL_DOCKER_SUPERVISOR_BIN`. +2. A sibling `openshell-sandbox` next to the running `openshell-gateway` binary. +3. A local Linux cargo target build for the Docker daemon architecture. +4. `--docker-supervisor-image` / `OPENSHELL_DOCKER_SUPERVISOR_IMAGE`, or the + release-matched default supervisor image, extracting `/openshell-sandbox`. + +Release and Docker-image gateway builds bake the matching supervisor image tag +into the binary at compile time. The default Docker supervisor image is not +`:latest` unless a custom build explicitly sets that tag. 
+ ## Callback and TLS `OPENSHELL_ENDPOINT` is injected from the gateway's configured gRPC endpoint diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 0eaef3bce..a864a3eb6 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -1759,7 +1759,7 @@ async fn extract_supervisor_binary_bytes(docker: &Docker, image: &str) -> CoreRe ), ContainerCreateBody { image: Some(image.to_string()), - entrypoint: Some(vec!["/openshell-sandbox".to_string()]), + entrypoint: Some(vec![SUPERVISOR_IMAGE_BINARY_PATH.to_string()]), cmd: Some(Vec::new()), ..Default::default() }, diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index e2d06044d..6c855f63e 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -688,19 +688,19 @@ fn supervisor_volume_mount() -> serde_json::Value { /// Path of the supervisor binary inside the supervisor image. /// -/// The supervisor image places the binary at the filesystem root and ships -/// nothing else. We invoke it directly — there is no shell, `cp`, or PATH -/// resolution available inside the image. +/// The supervisor image places the binary at the filesystem root. We invoke +/// it directly so the init path does not depend on shell utilities or PATH +/// resolution inside the image. const SUPERVISOR_IMAGE_BINARY_PATH: &str = "/openshell-sandbox"; /// Build the init container that copies the supervisor binary into the emptyDir. /// -/// The supervisor image contains only the supervisor binary at -/// `/openshell-sandbox`. We invoke that binary with the `copy-self` -/// subcommand so it copies itself into the shared emptyDir volume, where the -/// agent container then executes it from a fixed, writable path. 
This pattern -/// (binary self-copy) avoids requiring `sh`/`cp` in the supervisor image and -/// mirrors the approach used by argoexec's emissary executor. +/// The supervisor image contains the supervisor binary at `/openshell-sandbox`. +/// We invoke that binary with the `copy-self` subcommand so it copies itself +/// into the shared emptyDir volume, where the agent container then executes it +/// from a fixed, writable path. This pattern (binary self-copy) avoids requiring +/// `sh`/`cp` in the supervisor image and mirrors the approach used by argoexec's +/// emissary executor. fn supervisor_init_container( supervisor_image: &str, supervisor_image_pull_policy: &str, @@ -1559,8 +1559,8 @@ mod tests { assert_eq!(init_containers[0]["image"], "supervisor-image:latest"); assert_eq!(init_containers[0]["imagePullPolicy"], "IfNotPresent"); - // The supervisor image ships only the binary (no shell). The init - // container must invoke the binary directly with `copy-self <dest>`. + // The init container must invoke the binary directly with + // `copy-self <dest>` rather than depending on shell utilities. let init_command = init_containers[0]["command"] .as_array() .expect("init container command should be set"); @@ -1573,7 +1573,7 @@ mod tests { ); assert!( !init_command.iter().any(|v| v == "sh"), - "init container must not depend on a shell (supervisor image ships only the binary)" + "init container must not depend on a shell" ); // Agent container command should be overridden to the emptyDir path diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs index 20d455663..6ae1bd5fe 100644 --- a/crates/openshell-sandbox/src/main.rs +++ b/crates/openshell-sandbox/src/main.rs @@ -19,9 +19,9 @@ use openshell_sandbox::run_sandbox; /// Subcommand name used to self-copy the supervisor binary into a shared volume. /// -/// The supervisor image only ships the binary itself, so init containers -/// cannot rely on `sh`/`cp` to copy the binary out. 
Invoking the binary itself -/// with this argument performs the copy in pure Rust. +/// Init containers invoke the binary directly instead of relying on `sh`/`cp` +/// to copy the binary out. Invoking the binary itself with this argument +/// performs the copy in pure Rust. const COPY_SELF_SUBCOMMAND: &str = "copy-self"; /// `OpenShell` Sandbox - process isolation and monitoring. @@ -148,9 +148,8 @@ fn copy_self(dest: &str) -> Result<()> { fn main() -> Result<()> { // Handle `copy-self <dest>` before clap so it works without any of the - // sandbox flags. The supervisor image only ships the binary itself, and - // Kubernetes init containers invoke this path to seed an emptyDir volume - // that the agent container then executes from. + // sandbox flags. Kubernetes init containers invoke this path to seed an + // emptyDir volume that the agent container then executes from. let raw_args: Vec<String> = std::env::args().collect(); if raw_args.get(1).map(String::as_str) == Some(COPY_SELF_SUBCOMMAND) { let dest = raw_args.get(2).ok_or_else(|| { diff --git a/tasks/docker.toml b/tasks/docker.toml index a58fdcf86..502b2363c 100644 --- a/tasks/docker.toml +++ b/tasks/docker.toml @@ -27,7 +27,7 @@ run = "tasks/scripts/docker-build-image.sh gateway" hide = true ["build:docker:supervisor"] -description = "Build the supervisor image (FROM scratch, binary only)" +description = "Build the supervisor image" run = "tasks/scripts/docker-build-image.sh supervisor" hide = true