From 16b25c152b11094dc6cc8bc014261bd134cc940e Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 7 May 2026 22:07:42 +0200 Subject: [PATCH 1/4] feat(gpu): honor device IDs in Docker and Podman --- crates/openshell-core/src/gpu.rs | 48 ++++++++++++++ crates/openshell-core/src/lib.rs | 1 + crates/openshell-driver-docker/README.md | 2 +- crates/openshell-driver-docker/src/lib.rs | 28 +++++---- crates/openshell-driver-docker/src/tests.rs | 26 +++++++- crates/openshell-driver-podman/README.md | 1 + .../openshell-driver-podman/src/container.rs | 63 ++++++++++++++++--- crates/openshell-driver-podman/src/driver.rs | 4 ++ 8 files changed, 151 insertions(+), 22 deletions(-) create mode 100644 crates/openshell-core/src/gpu.rs diff --git a/crates/openshell-core/src/gpu.rs b/crates/openshell-core/src/gpu.rs new file mode 100644 index 000000000..5df8702ed --- /dev/null +++ b/crates/openshell-core/src/gpu.rs @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Shared GPU request helpers. + +use crate::config::CDI_GPU_DEVICE_ALL; + +/// Resolve the existing GPU request fields into CDI device identifiers. +/// +/// `None` means no GPU was requested. A GPU request with no explicit device +/// ID uses the CDI all-GPU request; otherwise the driver-native ID passes +/// through unchanged. 
+#[must_use]
+pub fn cdi_gpu_device_ids(gpu: bool, gpu_device: &str) -> Option<Vec<String>> {
+    gpu.then(|| {
+        if gpu_device.is_empty() {
+            vec![CDI_GPU_DEVICE_ALL.to_string()]
+        } else {
+            vec![gpu_device.to_string()]
+        }
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn cdi_gpu_device_ids_returns_none_when_absent() {
+        assert_eq!(cdi_gpu_device_ids(false, ""), None);
+    }
+
+    #[test]
+    fn cdi_gpu_device_ids_defaults_empty_request_to_all_gpus() {
+        assert_eq!(
+            cdi_gpu_device_ids(true, ""),
+            Some(vec![CDI_GPU_DEVICE_ALL.to_string()])
+        );
+    }
+
+    #[test]
+    fn cdi_gpu_device_ids_passes_explicit_device_id_through() {
+        assert_eq!(
+            cdi_gpu_device_ids(true, "nvidia.com/gpu=0"),
+            Some(vec!["nvidia.com/gpu=0".to_string()])
+        );
+    }
+}
diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs
index a4a1ea822..893b01f5f 100644
--- a/crates/openshell-core/src/lib.rs
+++ b/crates/openshell-core/src/lib.rs
@@ -12,6 +12,7 @@
 pub mod config;
 pub mod error;
 pub mod forward;
+pub mod gpu;
 pub mod image;
 pub mod inference;
 pub mod metadata;
diff --git a/crates/openshell-driver-docker/README.md b/crates/openshell-driver-docker/README.md
index 7bc8048b2..2c02eaa5e 100644
--- a/crates/openshell-driver-docker/README.md
+++ b/crates/openshell-driver-docker/README.md
@@ -30,7 +30,7 @@ contract:
 | `cap_add` | Grants supervisor-only capabilities required for namespace setup and process inspection. |
 | `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. |
 | `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. |
-| CDI GPU request | Requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |
+| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. 
| The agent child process does not retain these supervisor privileges. diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 0eaef3bce..4d2fb9eeb 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -18,9 +18,8 @@ use bollard::query_parameters::{ }; use bytes::Bytes; use futures::{Stream, StreamExt}; -use openshell_core::config::{ - CDI_GPU_DEVICE_ALL, DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS, -}; +use openshell_core::config::{DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS}; +use openshell_core::gpu::cdi_gpu_device_ids; use openshell_core::proto::compute::v1::{ CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse, DriverCondition, DriverSandbox, DriverSandboxStatus, DriverSandboxTemplate, @@ -309,11 +308,7 @@ impl DockerComputeDriver { "docker sandboxes require a template image", )); } - if spec.gpu && !config.supports_gpu { - return Err(Status::failed_precondition( - "docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.", - )); - } + Self::validate_gpu_request(spec.gpu, config.supports_gpu)?; if !template.agent_socket_path.trim().is_empty() { return Err(Status::failed_precondition( "docker compute driver does not support template.agent_socket_path", @@ -333,6 +328,15 @@ impl DockerComputeDriver { Ok(()) } + fn validate_gpu_request(gpu: bool, supports_gpu: bool) -> Result<(), Status> { + if gpu && !supports_gpu { + return Err(Status::failed_precondition( + "docker GPU sandboxes require Docker CDI support. 
Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.",
+            ));
+        }
+        Ok(())
+    }
+
     async fn get_sandbox_snapshot(
         &self,
         sandbox_id: &str,
@@ -945,11 +949,11 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig
         .collect()
 }
 
-fn docker_gpu_device_requests(gpu: bool) -> Option<Vec<DeviceRequest>> {
-    gpu.then(|| {
+fn docker_gpu_device_requests(gpu: bool, gpu_device: &str) -> Option<Vec<DeviceRequest>> {
+    cdi_gpu_device_ids(gpu, gpu_device).map(|device_ids| {
         vec![DeviceRequest {
             driver: Some("cdi".to_string()),
-            device_ids: Some(vec![CDI_GPU_DEVICE_ALL.to_string()]),
+            device_ids: Some(device_ids),
             ..Default::default()
         }]
     })
@@ -996,7 +1000,7 @@ fn build_container_create_body(
         host_config: Some(HostConfig {
             nano_cpus: resource_limits.nano_cpus,
             memory: resource_limits.memory_bytes,
-            device_requests: docker_gpu_device_requests(spec.gpu),
+            device_requests: docker_gpu_device_requests(spec.gpu, &spec.gpu_device),
             mounts: Some(build_mounts(config)),
             restart_policy: Some(RestartPolicy {
                 name: Some(RestartPolicyNameEnum::UNLESS_STOPPED),
diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs
index e41f2688e..9bda6da82 100644
--- a/crates/openshell-driver-docker/src/tests.rs
+++ b/crates/openshell-driver-docker/src/tests.rs
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 use super::*;
-use openshell_core::config::DEFAULT_SERVER_PORT;
+use openshell_core::config::{CDI_GPU_DEVICE_ALL, DEFAULT_SERVER_PORT};
 use openshell_core::proto::compute::v1::{
     DriverResourceRequirements, DriverSandboxSpec, DriverSandboxTemplate,
 };
@@ -425,6 +425,30 @@ fn build_container_create_body_maps_gpu_to_all_cdi_device() {
     );
 }
 
+#[test]
+fn build_container_create_body_passes_explicit_cdi_device_id_through() {
+    let mut config = runtime_config();
+    config.supports_gpu = true;
+    let mut sandbox = test_sandbox();
+    let spec = sandbox.spec.as_mut().unwrap();
+    spec.gpu = true;
+    spec.gpu_device = 
"nvidia.com/gpu=0".to_string(); + + let create_body = build_container_create_body(&sandbox, &config).unwrap(); + let request = create_body + .host_config + .as_ref() + .and_then(|host_config| host_config.device_requests.as_ref()) + .and_then(|requests| requests.first()) + .expect("GPU request should add a Docker device request"); + + assert_eq!(request.driver.as_deref(), Some("cdi")); + assert_eq!( + request.device_ids.as_ref().unwrap(), + &vec!["nvidia.com/gpu=0".to_string()] + ); +} + #[test] fn require_sandbox_identifier_rejects_when_id_and_name_are_empty() { // Regression test: `delete_sandbox` (and the other identifier-keyed diff --git a/crates/openshell-driver-podman/README.md b/crates/openshell-driver-podman/README.md index d853bb5ea..5b88010e4 100644 --- a/crates/openshell-driver-podman/README.md +++ b/crates/openshell-driver-podman/README.md @@ -46,6 +46,7 @@ The container spec in `container.rs` sets these security-critical fields: | `no_new_privileges` | `true` | Prevents privilege escalation after exec. | | `seccomp_profile_path` | `unconfined` | The supervisor installs its own policy-aware BPF filter. A container-level profile can block Landlock/seccomp syscalls during setup. | | `mounts` | Private tmpfs at `/run/netns` | Lets the supervisor create named network namespaces in rootless Podman. | +| CDI GPU devices | Sandbox `gpu_device` value when set, otherwise all NVIDIA GPUs | Exposes requested GPUs to GPU-enabled sandbox containers. | The restricted agent child does not retain these supervisor privileges. diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index 3c5df292f..5b9b0d735 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -4,7 +4,7 @@ //! Container spec construction for the Podman driver. 
use crate::config::PodmanComputeConfig;
-use openshell_core::config::CDI_GPU_DEVICE_ALL;
+use openshell_core::gpu::cdi_gpu_device_ids;
 use openshell_core::proto::compute::v1::DriverSandbox;
 use serde::Serialize;
 use serde_json::Value;
@@ -345,13 +345,13 @@ fn build_resource_limits(sandbox: &DriverSandbox) -> ResourceLimits {
 
 /// Build CDI GPU device list if GPU is requested.
 fn build_devices(sandbox: &DriverSandbox) -> Option<Vec<LinuxDevice>> {
-    if sandbox.spec.as_ref().is_some_and(|s| s.gpu) {
-        Some(vec![LinuxDevice {
-            path: CDI_GPU_DEVICE_ALL.into(),
-        }])
-    } else {
-        None
-    }
+    let spec = sandbox.spec.as_ref()?;
+    cdi_gpu_device_ids(spec.gpu, &spec.gpu_device).map(|device_ids| {
+        device_ids
+            .into_iter()
+            .map(|path| LinuxDevice { path })
+            .collect()
+    })
 }
 
 /// Build the Podman container creation JSON spec.
@@ -687,6 +687,53 @@ mod tests {
         assert_eq!(short_id("short"), "short");
     }
 
+    #[test]
+    fn container_spec_omits_devices_without_gpu_request() {
+        let sandbox = test_sandbox("test-id", "test-name");
+        let config = test_config();
+        let spec = build_container_spec(&sandbox, &config);
+
+        assert!(spec.get("devices").is_none());
+    }
+
+    #[test]
+    fn container_spec_maps_empty_gpu_request_to_all_cdi_device() {
+        use openshell_core::config::CDI_GPU_DEVICE_ALL;
+        use openshell_core::proto::compute::v1::DriverSandboxSpec;
+
+        let mut sandbox = test_sandbox("test-id", "test-name");
+        sandbox.spec = Some(DriverSandboxSpec {
+            gpu: true,
+            ..Default::default()
+        });
+        let config = test_config();
+        let spec = build_container_spec(&sandbox, &config);
+
+        assert_eq!(
+            spec["devices"][0]["path"].as_str(),
+            Some(CDI_GPU_DEVICE_ALL)
+        );
+    }
+
+    #[test]
+    fn container_spec_passes_explicit_cdi_device_id_through() {
+        use openshell_core::proto::compute::v1::DriverSandboxSpec;
+
+        let mut sandbox = test_sandbox("test-id", "test-name");
+        sandbox.spec = Some(DriverSandboxSpec {
+            gpu: true,
+            gpu_device: "nvidia.com/gpu=0".to_string(),
+            ..Default::default()
+        });
+        let config = 
test_config(); + let spec = build_container_spec(&sandbox, &config); + + assert_eq!( + spec["devices"][0]["path"].as_str(), + Some("nvidia.com/gpu=0") + ); + } + #[test] fn container_spec_includes_required_capabilities() { let sandbox = test_sandbox("test-id", "test-name"); diff --git a/crates/openshell-driver-podman/src/driver.rs b/crates/openshell-driver-podman/src/driver.rs index ad4d7a192..f78c5c730 100644 --- a/crates/openshell-driver-podman/src/driver.rs +++ b/crates/openshell-driver-podman/src/driver.rs @@ -199,6 +199,10 @@ impl PodmanComputeDriver { sandbox: &DriverSandbox, ) -> Result<(), ComputeDriverError> { let gpu_requested = sandbox.spec.as_ref().is_some_and(|s| s.gpu); + Self::validate_gpu_request(gpu_requested) + } + + fn validate_gpu_request(gpu_requested: bool) -> Result<(), ComputeDriverError> { if gpu_requested && !Self::has_gpu_capacity() { return Err(ComputeDriverError::Precondition( "GPU sandbox requested, but no NVIDIA GPU devices are available.".to_string(), From 12712b4bc6a5198ca39885eb056e3aaab3b91f1f Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 7 May 2026 23:49:17 +0200 Subject: [PATCH 2/4] test(gpu): add Docker and Podman device selection e2e --- TESTING.md | 6 + crates/openshell-cli/src/main.rs | 4 +- e2e/rust/Cargo.toml | 15 +- e2e/rust/e2e-podman.sh | 15 +- e2e/rust/tests/docker_gpu.rs | 36 ---- e2e/rust/tests/gpu_device_selection.rs | 230 +++++++++++++++++++++++++ e2e/with-podman-gateway.sh | 1 + tasks/test.toml | 7 +- 8 files changed, 266 insertions(+), 48 deletions(-) delete mode 100644 e2e/rust/tests/docker_gpu.rs create mode 100644 e2e/rust/tests/gpu_device_selection.rs diff --git a/TESTING.md b/TESTING.md index 49c9b781a..bfcb5ea8a 100644 --- a/TESTING.md +++ b/TESTING.md @@ -151,6 +151,12 @@ Suites: - Docker suite (`--features e2e-docker`) - common suite plus Docker-only coverage such as Dockerfile image builds, Docker preflight checks, and managed Docker gateway resume. 
- Docker GPU suite (`--features e2e-docker-gpu`) - Docker suite plus GPU sandbox smoke coverage. +GPU device-selection tests compare OpenShell sandboxes against a plain Docker or +Podman container that requests `--device nvidia.com/gpu=all`. The probe image +defaults to the image used by the `gateway` stage in +`deploy/docker/Dockerfile.images`; set `OPENSHELL_E2E_GPU_PROBE_IMAGE` to +override it. + Run the Docker-backed Rust CLI e2e suite: ```shell diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index cd14568ef..7136e774f 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -1061,8 +1061,8 @@ enum SandboxCommands { #[arg(long)] gpu: bool, - /// Target a specific GPU by PCI address (e.g. "0000:2d:00.0") or index (e.g. "0", "1"). - /// Only valid with --gpu. When omitted with --gpu, the first available GPU is assigned. + /// Target GPUs by CDI device ID, for example "nvidia.com/gpu=0" or "nvidia.com/gpu=all". + /// Only valid with --gpu. When omitted with --gpu, all GPUs are requested. 
#[arg(long, requires = "gpu")] gpu_device: Option, diff --git a/e2e/rust/Cargo.toml b/e2e/rust/Cargo.toml index 57bc1ff68..2357d3369 100644 --- a/e2e/rust/Cargo.toml +++ b/e2e/rust/Cargo.toml @@ -18,18 +18,16 @@ publish = false [features] e2e = [] e2e-docker = ["e2e"] -e2e-docker-gpu = ["e2e-docker"] +e2e-gpu = ["e2e"] +e2e-docker-gpu = ["e2e-docker", "e2e-gpu"] +e2e-podman = ["e2e"] +e2e-podman-gpu = ["e2e-podman", "e2e-gpu"] [[test]] name = "custom_image" path = "tests/custom_image.rs" required-features = ["e2e-docker"] -[[test]] -name = "docker_gpu" -path = "tests/docker_gpu.rs" -required-features = ["e2e-docker-gpu"] - [[test]] name = "docker_preflight" path = "tests/docker_preflight.rs" @@ -40,6 +38,11 @@ name = "gateway_resume" path = "tests/gateway_resume.rs" required-features = ["e2e-docker"] +[[test]] +name = "gpu_device_selection" +path = "tests/gpu_device_selection.rs" +required-features = ["e2e-gpu"] + [dependencies] tokio = { version = "1.43", features = ["full"] } tempfile = "3" diff --git a/e2e/rust/e2e-podman.sh b/e2e/rust/e2e-podman.sh index 44c2eaeb4..c82891338 100755 --- a/e2e/rust/e2e-podman.sh +++ b/e2e/rust/e2e-podman.sh @@ -9,10 +9,19 @@ set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" +E2E_TEST="${OPENSHELL_E2E_PODMAN_TEST:-}" +E2E_FEATURES="${OPENSHELL_E2E_PODMAN_FEATURES:-e2e}" cargo build -p openshell-cli --features openshell-core/dev-settings +TEST_ARGS=( + cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" + --features "${E2E_FEATURES}" +) +if [ -n "${E2E_TEST}" ]; then + TEST_ARGS+=(--test "${E2E_TEST}") +fi +TEST_ARGS+=(-- --nocapture) + exec "${ROOT}/e2e/with-podman-gateway.sh" \ - cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ - --features e2e \ - -- --nocapture + "${TEST_ARGS[@]}" diff --git a/e2e/rust/tests/docker_gpu.rs b/e2e/rust/tests/docker_gpu.rs deleted file mode 100644 index f85dc48b8..000000000 --- a/e2e/rust/tests/docker_gpu.rs +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -#![cfg(feature = "e2e")] - -//! Docker GPU e2e test. -//! -//! Requires a Docker-backed gateway started with Docker CDI support. The -//! `e2e:docker:gpu` mise task starts that gateway with the default sandbox image -//! unless OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE is set. 
- -use openshell_e2e::harness::output::strip_ansi; -use openshell_e2e::harness::sandbox::SandboxGuard; - -#[tokio::test] -async fn docker_gpu_sandbox_runs_nvidia_smi() { - let mut guard = SandboxGuard::create(&[ - "--gpu", - "--", - "sh", - "-lc", - "gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1); \ - test -n \"$gpu_name\"; \ - printf 'gpu-ok:%s\n' \"$gpu_name\"", - ]) - .await - .expect("GPU sandbox create should succeed"); - - let output = strip_ansi(&guard.create_output); - assert!( - output.contains("gpu-ok:"), - "expected GPU smoke marker in sandbox output:\n{output}" - ); - - guard.cleanup().await; -} diff --git a/e2e/rust/tests/gpu_device_selection.rs b/e2e/rust/tests/gpu_device_selection.rs new file mode 100644 index 000000000..4ca2c8dcd --- /dev/null +++ b/e2e/rust/tests/gpu_device_selection.rs @@ -0,0 +1,230 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e-gpu")] + +//! GPU device selection e2e tests. +//! +//! Requires a GPU-backed gateway and a sandbox image containing `nvidia-smi`. 
+
+use std::path::{Path, PathBuf};
+use std::process::Stdio;
+use std::time::Duration;
+
+use openshell_e2e::harness::binary::openshell_cmd;
+use openshell_e2e::harness::container::ContainerEngine;
+use openshell_e2e::harness::output::strip_ansi;
+use openshell_e2e::harness::sandbox::SandboxGuard;
+use tokio::time::timeout;
+
+const SANDBOX_CREATE_TIMEOUT: Duration = Duration::from_secs(600);
+const GPU_PROBE_DOCKERFILE_STAGE: &str = "gateway";
+
+fn gpu_lines(output: &str) -> Vec<String> {
+    strip_ansi(output)
+        .lines()
+        .map(str::trim)
+        .filter(|line| line.starts_with("GPU "))
+        .map(ToOwned::to_owned)
+        .collect()
+}
+
+fn gpu_uuid(line: &str) -> &str {
+    let (_, uuid) = line
+        .rsplit_once("(UUID: ")
+        .unwrap_or_else(|| panic!("GPU line did not include a UUID: {line}"));
+    uuid.strip_suffix(')').unwrap_or(uuid)
+}
+
+fn workspace_root() -> PathBuf {
+    Path::new(env!("CARGO_MANIFEST_DIR"))
+        .ancestors()
+        .nth(2)
+        .expect("failed to resolve workspace root from CARGO_MANIFEST_DIR")
+        .to_path_buf()
+}
+
+fn dockerfile_images_gpu_probe_image() -> String {
+    let dockerfile = workspace_root().join("deploy/docker/Dockerfile.images");
+    let contents = std::fs::read_to_string(&dockerfile)
+        .unwrap_or_else(|err| panic!("failed to read {}: {err}", dockerfile.display()));
+
+    contents
+        .lines()
+        .map(str::trim)
+        .find_map(|line| {
+            let mut parts = line.split_whitespace();
+            let instruction = parts.next()?;
+            let image = parts.next()?;
+            let as_keyword = parts.next()?;
+            let stage = parts.next()?;
+
+            if instruction.eq_ignore_ascii_case("FROM")
+                && as_keyword.eq_ignore_ascii_case("AS")
+                && stage == GPU_PROBE_DOCKERFILE_STAGE
+            {
+                Some(image)
+            } else {
+                None
+            }
+        })
+        .unwrap_or_else(|| {
+            panic!(
+                "failed to find a FROM <image> AS {GPU_PROBE_DOCKERFILE_STAGE} stage in {}",
+                dockerfile.display()
+            )
+        })
+        .to_string()
+}
+
+fn gpu_probe_image() -> String {
+    std::env::var("OPENSHELL_E2E_GPU_PROBE_IMAGE")
+        .ok()
+        .map(|value| value.trim().to_string())
+        
.filter(|value| !value.is_empty())
+        .unwrap_or_else(dockerfile_images_gpu_probe_image)
+}
+
+fn runtime_gpu_lines() -> Vec<String> {
+    let engine = ContainerEngine::from_env();
+    let image = gpu_probe_image();
+    let output = engine
+        .command()
+        .args([
+            "run",
+            "--rm",
+            "--device",
+            "nvidia.com/gpu=all",
+            image.as_str(),
+            "nvidia-smi",
+            "-L",
+        ])
+        .output()
+        .unwrap_or_else(|err| {
+            panic!(
+                "failed to run {} GPU probe container with image {image}: {err}",
+                engine.name()
+            )
+        });
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    let combined = format!("{stdout}{stderr}");
+    assert!(
+        output.status.success(),
+        "{} GPU probe failed with image {image} and status {:?}:\n{}",
+        engine.name(),
+        output.status.code(),
+        combined
+    );
+
+    let lines = gpu_lines(&stdout);
+    assert!(
+        !lines.is_empty(),
+        "{} GPU probe did not report any GPU lines with image {image}:\n{combined}",
+        engine.name()
+    );
+    lines
+}
+
+async fn sandbox_gpu_lines(gpu_device: Option<&str>) -> Vec<String> {
+    let mut args = vec!["--gpu"];
+    if let Some(gpu_device) = gpu_device {
+        args.push("--gpu-device");
+        args.push(gpu_device);
+    }
+    args.extend(["--", "sh", "-lc", "nvidia-smi -L"]);
+
+    let mut guard = SandboxGuard::create(&args)
+        .await
+        .expect("GPU sandbox create should succeed");
+
+    let lines = gpu_lines(&guard.create_output);
+    guard.cleanup().await;
+    lines
+}
+
+async fn sandbox_create_output(args: &[&str]) -> String {
+    let mut cmd = openshell_cmd();
+    cmd.arg("sandbox").arg("create").args(args);
+    cmd.stdout(Stdio::piped()).stderr(Stdio::piped());
+
+    let output = timeout(SANDBOX_CREATE_TIMEOUT, cmd.output())
+        .await
+        .expect("sandbox create should complete before timeout")
+        .expect("openshell command should spawn");
+
+    assert!(
+        !output.status.success(),
+        "sandbox create unexpectedly succeeded with invalid GPU device"
+    );
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    let stderr = 
String::from_utf8_lossy(&output.stderr); + strip_ansi(&format!("{stdout}{stderr}")) +} + +#[tokio::test] +async fn gpu_request_without_device_matches_plain_all_gpu_container() { + let expected = runtime_gpu_lines(); + let actual = sandbox_gpu_lines(None).await; + + assert_eq!( + actual, expected, + "default GPU request should expose the same GPU lines as a plain all-GPU container" + ); +} + +#[tokio::test] +async fn gpu_request_for_each_index_exposes_requested_gpu_uuid() { + let expected = runtime_gpu_lines(); + + for (index, expected_line) in expected.iter().enumerate() { + let gpu_device = format!("nvidia.com/gpu={index}"); + let actual = sandbox_gpu_lines(Some(&gpu_device)).await; + assert_eq!( + actual.len(), + 1, + "GPU request for {gpu_device} should expose one GPU line:\n{actual:#?}" + ); + + assert_eq!( + gpu_uuid(&actual[0]), + gpu_uuid(expected_line), + "GPU request for {gpu_device} should expose the matching physical GPU UUID" + ); + } +} + +#[tokio::test] +async fn gpu_all_device_request_matches_plain_all_gpu_container() { + let expected = runtime_gpu_lines(); + let actual = sandbox_gpu_lines(Some("nvidia.com/gpu=all")).await; + + assert_eq!( + actual, expected, + "explicit all-GPU request should expose the same GPU lines as a plain all-GPU container" + ); +} + +#[tokio::test] +async fn gpu_invalid_device_request_fails() { + let output = sandbox_create_output(&[ + "--gpu", + "--gpu-device", + "nvidia.com/gpu=invalid", + "--", + "sh", + "-lc", + "nvidia-smi -L", + ]) + .await; + let output_lower = output.to_ascii_lowercase(); + + assert!( + output.contains("nvidia.com/gpu=invalid") + || output_lower.contains("cdi") + || output_lower.contains("device"), + "expected invalid GPU device failure to mention the requested device or CDI/device resolution:\n{output}" + ); +} diff --git a/e2e/with-podman-gateway.sh b/e2e/with-podman-gateway.sh index ee8073f2b..fd74a4124 100755 --- a/e2e/with-podman-gateway.sh +++ b/e2e/with-podman-gateway.sh @@ -64,6 +64,7 @@ 
PODMAN_NETWORK_MANAGED=0 PODMAN_SERVICE_PID="" PODMAN_SERVICE_LOG="${WORKDIR}/podman-service.log" PODMAN_SOCKET="" +GPU_MODE="${OPENSHELL_E2E_PODMAN_GPU:-0}" # Isolate CLI/SDK gateway metadata from the developer's real config. export XDG_CONFIG_HOME="${WORKDIR}/config" diff --git a/tasks/test.toml b/tasks/test.toml index bf5741c72..4863f7f54 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -50,6 +50,11 @@ run = "e2e/with-docker-gateway.sh uv run pytest -o python_files='test_*.py' -m g description = "Run Rust CLI e2e tests against a Podman-backed gateway" run = "e2e/rust/e2e-podman.sh" +["e2e:podman:gpu"] +description = "Run GPU e2e against a standalone gateway with the Podman compute driver" +env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu_device_selection", OPENSHELL_E2E_PODMAN_FEATURES = "e2e-podman-gpu" } +run = "e2e/rust/e2e-podman.sh" + ["e2e:vm"] description = "Start openshell-gateway with the VM compute driver and run the cluster-agnostic smoke e2e" run = "e2e/rust/e2e-vm.sh" @@ -60,5 +65,5 @@ run = "e2e/rust/e2e-docker.sh" ["e2e:docker:gpu"] description = "Run GPU e2e against a standalone gateway with the Docker compute driver" -env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "docker_gpu", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" } +env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "gpu_device_selection", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" } run = "e2e/rust/e2e-docker.sh" From 8d42881c4b1457ee2750a8c6bd5d034e8ece7eda Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 8 May 2026 00:04:28 +0200 Subject: [PATCH 3/4] ci(gpu): run Docker GPU e2e workflow --- .github/workflows/e2e-gpu-test.yaml | 12 ++++++++---- tasks/test.toml | 6 +++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/e2e-gpu-test.yaml b/.github/workflows/e2e-gpu-test.yaml index f61c8c7ae..429a82524 100644 --- a/.github/workflows/e2e-gpu-test.yaml +++ 
b/.github/workflows/e2e-gpu-test.yaml @@ -14,7 +14,7 @@ permissions: jobs: e2e-gpu: - name: "E2E GPU (${{ matrix.name }})" + name: "E2E Docker GPU (${{ matrix.name }})" runs-on: ${{ matrix.runner }} continue-on-error: ${{ matrix.experimental }} timeout-minutes: 30 @@ -55,8 +55,12 @@ jobs: - name: Log in to GHCR run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin - - name: Install Python dependencies and generate protobuf stubs - run: uv sync --frozen && mise run --no-deps python:proto + - name: Check Docker GPU prerequisites + run: | + docker info --format '{{json .CDISpecDirs}}' + GPU_PROBE_IMAGE="$(awk '$1 == "FROM" && $3 == "AS" && $4 == "gateway" { print $2; exit }' deploy/docker/Dockerfile.images)" + test -n "${GPU_PROBE_IMAGE}" + docker run --rm --device nvidia.com/gpu=all "${GPU_PROBE_IMAGE}" nvidia-smi -L - name: Run tests - run: mise run --no-deps --skip-deps e2e:python:gpu + run: mise run --no-deps --skip-deps e2e:docker:gpu diff --git a/tasks/test.toml b/tasks/test.toml index 4863f7f54..747d3c362 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -12,7 +12,11 @@ description = "Run all end-to-end tests (Rust + Python)" depends = ["e2e:rust", "e2e:python"] ["e2e:gpu"] -description = "Run GPU end-to-end tests" +description = "Run Docker GPU end-to-end tests" +depends = ["e2e:docker:gpu"] + +["e2e:k3s:gpu"] +description = "Run k3s GPU end-to-end tests" depends = ["e2e:python:gpu"] ["test:rust"] From 817a97aa2fe6e13ddb27447d3d9af278cc710013 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 8 May 2026 00:12:37 +0200 Subject: [PATCH 4/4] docs(helm): fix README markdown lint --- deploy/helm/openshell/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md index 3662b130a..ee7565f29 100644 --- a/deploy/helm/openshell/README.md +++ b/deploy/helm/openshell/README.md @@ -29,10 +29,10 @@ oc adm policy add-scc-to-user 
privileged -z default -n openshell
 
 # Deploy openshell with overrides to allow SCC assignment of fsGroup and runAsUser for the gateway
 helm install openshell oci://ghcr.io/nvidia/openshell/helm-chart --version <version> -n openshell \
-    --set pkiInitJob.enabled=false \
-    --set server.disableTls=true \
-    --set podSecurityContext.fsGroup=null \
-    --set securityContext.runAsUser=null
+    --set pkiInitJob.enabled=false \
+    --set server.disableTls=true \
+    --set podSecurityContext.fsGroup=null \
+    --set securityContext.runAsUser=null
 ```
 
 ## Available versions