diff --git a/.github/workflows/e2e-gpu-test.yaml b/.github/workflows/e2e-gpu-test.yaml index f61c8c7ae..429a82524 100644 --- a/.github/workflows/e2e-gpu-test.yaml +++ b/.github/workflows/e2e-gpu-test.yaml @@ -14,7 +14,7 @@ permissions: jobs: e2e-gpu: - name: "E2E GPU (${{ matrix.name }})" + name: "E2E Docker GPU (${{ matrix.name }})" runs-on: ${{ matrix.runner }} continue-on-error: ${{ matrix.experimental }} timeout-minutes: 30 @@ -55,8 +55,12 @@ jobs: - name: Log in to GHCR run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin - - name: Install Python dependencies and generate protobuf stubs - run: uv sync --frozen && mise run --no-deps python:proto + - name: Check Docker GPU prerequisites + run: | + docker info --format '{{json .CDISpecDirs}}' + GPU_PROBE_IMAGE="$(awk '$1 == "FROM" && $3 == "AS" && $4 == "gateway" { print $2; exit }' deploy/docker/Dockerfile.images)" + test -n "${GPU_PROBE_IMAGE}" + docker run --rm --device nvidia.com/gpu=all "${GPU_PROBE_IMAGE}" nvidia-smi -L - name: Run tests - run: mise run --no-deps --skip-deps e2e:python:gpu + run: mise run --no-deps --skip-deps e2e:docker:gpu diff --git a/TESTING.md b/TESTING.md index 49c9b781a..7bcf2d203 100644 --- a/TESTING.md +++ b/TESTING.md @@ -151,6 +151,14 @@ Suites: - Docker suite (`--features e2e-docker`) - common suite plus Docker-only coverage such as Dockerfile image builds, Docker preflight checks, and managed Docker gateway resume. - Docker GPU suite (`--features e2e-docker-gpu`) - Docker suite plus GPU sandbox smoke coverage. +GPU device-selection tests compare OpenShell sandboxes against a plain Docker or +Podman container that requests `--device nvidia.com/gpu=all`. The probe image +defaults to the image used by the `gateway` stage in +`deploy/docker/Dockerfile.images`; set `OPENSHELL_E2E_GPU_PROBE_IMAGE` to +override it. Per-device checks run only for NVIDIA CDI device IDs reported by +the runtime's discovered devices list, so WSL2 hosts that expose only +`nvidia.com/gpu=all` skip the index-based cases. + Run the Docker-backed Rust CLI e2e suite: ```shell diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index cd14568ef..01c4c4916 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -1061,8 +1061,9 @@ enum SandboxCommands { #[arg(long)] gpu: bool, - /// Target a specific GPU by PCI address (e.g. "0000:2d:00.0") or index (e.g. "0", "1"). - /// Only valid with --gpu. When omitted with --gpu, the first available GPU is assigned. + /// Target a driver-specific GPU device. Docker and Podman use CDI device IDs + /// (for example "nvidia.com/gpu=0"); VM uses a PCI BDF or index. + /// Only valid with --gpu. When omitted with --gpu, the driver uses its default GPU selection. #[arg(long, requires = "gpu")] gpu_device: Option, diff --git a/crates/openshell-core/src/gpu.rs b/crates/openshell-core/src/gpu.rs new file mode 100644 index 000000000..5df8702ed --- /dev/null +++ b/crates/openshell-core/src/gpu.rs @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Shared GPU request helpers. + +use crate::config::CDI_GPU_DEVICE_ALL; + +/// Resolve the existing GPU request fields into CDI device identifiers. +/// +/// `None` means no GPU was requested. A GPU request with no explicit device +/// ID uses the CDI all-GPU request; otherwise the driver-native ID passes +/// through unchanged. +#[must_use] +pub fn cdi_gpu_device_ids(gpu: bool, gpu_device: &str) -> Option> { + gpu.then(|| { + if gpu_device.is_empty() { + vec![CDI_GPU_DEVICE_ALL.to_string()] + } else { + vec![gpu_device.to_string()] + } + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cdi_gpu_device_ids_returns_none_when_absent() { + assert_eq!(cdi_gpu_device_ids(false, ""), None); + } + + #[test] + fn cdi_gpu_device_ids_defaults_empty_request_to_all_gpus() { + assert_eq!( + cdi_gpu_device_ids(true, ""), + Some(vec![CDI_GPU_DEVICE_ALL.to_string()]) + ); + } + + #[test] + fn cdi_gpu_device_ids_passes_explicit_device_id_through() { + assert_eq!( + cdi_gpu_device_ids(true, "nvidia.com/gpu=0"), + Some(vec!["nvidia.com/gpu=0".to_string()]) + ); + } +} diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index a4a1ea822..893b01f5f 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -12,6 +12,7 @@ pub mod config; pub mod error; pub mod forward; +pub mod gpu; pub mod image; pub mod inference; pub mod metadata; diff --git a/crates/openshell-driver-docker/README.md b/crates/openshell-driver-docker/README.md index 99b6e1385..20cfe6a0f 100644 --- a/crates/openshell-driver-docker/README.md +++ b/crates/openshell-driver-docker/README.md @@ -30,7 +30,7 @@ contract: | `cap_add` | Grants supervisor-only capabilities required for namespace setup and process inspection. | | `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. | | `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. | -| CDI GPU request | Requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. | +| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. | The agent child process does not retain these supervisor privileges. diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index a864a3eb6..fb29482d9 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -18,9 +18,8 @@ use bollard::query_parameters::{ }; use bytes::Bytes; use futures::{Stream, StreamExt}; -use openshell_core::config::{ - CDI_GPU_DEVICE_ALL, DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS, -}; +use openshell_core::config::{DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS}; +use openshell_core::gpu::cdi_gpu_device_ids; use openshell_core::proto::compute::v1::{ CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse, DriverCondition, DriverSandbox, DriverSandboxStatus, DriverSandboxTemplate, @@ -309,11 +308,7 @@ impl DockerComputeDriver { "docker sandboxes require a template image", )); } - if spec.gpu && !config.supports_gpu { - return Err(Status::failed_precondition( - "docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.", - )); - } + Self::validate_gpu_request(spec.gpu, config.supports_gpu)?; if !template.agent_socket_path.trim().is_empty() { return Err(Status::failed_precondition( "docker compute driver does not support template.agent_socket_path", @@ -333,6 +328,15 @@ impl DockerComputeDriver { Ok(()) } + fn validate_gpu_request(gpu: bool, supports_gpu: bool) -> Result<(), Status> { + if gpu && !supports_gpu { + return Err(Status::failed_precondition( + "docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.", + )); + } + Ok(()) + } + async fn get_sandbox_snapshot( &self, sandbox_id: &str, @@ -945,11 +949,11 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig .collect() } -fn docker_gpu_device_requests(gpu: bool) -> Option> { - gpu.then(|| { +fn docker_gpu_device_requests(gpu: bool, gpu_device: &str) -> Option> { + cdi_gpu_device_ids(gpu, gpu_device).map(|device_ids| { vec![DeviceRequest { driver: Some("cdi".to_string()), - device_ids: Some(vec![CDI_GPU_DEVICE_ALL.to_string()]), + device_ids: Some(device_ids), ..Default::default() }] }) @@ -996,7 +1000,7 @@ fn build_container_create_body( host_config: Some(HostConfig { nano_cpus: resource_limits.nano_cpus, memory: resource_limits.memory_bytes, - device_requests: docker_gpu_device_requests(spec.gpu), + device_requests: docker_gpu_device_requests(spec.gpu, &spec.gpu_device), mounts: Some(build_mounts(config)), restart_policy: Some(RestartPolicy { name: Some(RestartPolicyNameEnum::UNLESS_STOPPED), diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index e41f2688e..9bda6da82 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use super::*; -use openshell_core::config::DEFAULT_SERVER_PORT; +use openshell_core::config::{CDI_GPU_DEVICE_ALL, DEFAULT_SERVER_PORT}; use openshell_core::proto::compute::v1::{ DriverResourceRequirements, DriverSandboxSpec, DriverSandboxTemplate, }; @@ -425,6 +425,30 @@ fn build_container_create_body_maps_gpu_to_all_cdi_device() { ); } +#[test] +fn build_container_create_body_passes_explicit_cdi_device_id_through() { + let mut config = runtime_config(); + config.supports_gpu = true; + let mut sandbox = test_sandbox(); + let spec = sandbox.spec.as_mut().unwrap(); + spec.gpu = true; + spec.gpu_device = "nvidia.com/gpu=0".to_string(); + + let create_body = build_container_create_body(&sandbox, &config).unwrap(); + let request = create_body + .host_config + .as_ref() + .and_then(|host_config| host_config.device_requests.as_ref()) + .and_then(|requests| requests.first()) + .expect("GPU request should add a Docker device request"); + + assert_eq!(request.driver.as_deref(), Some("cdi")); + assert_eq!( + request.device_ids.as_ref().unwrap(), + &vec!["nvidia.com/gpu=0".to_string()] + ); +} + #[test] fn require_sandbox_identifier_rejects_when_id_and_name_are_empty() { // Regression test: `delete_sandbox` (and the other identifier-keyed diff --git a/crates/openshell-driver-podman/README.md b/crates/openshell-driver-podman/README.md index d853bb5ea..5b88010e4 100644 --- a/crates/openshell-driver-podman/README.md +++ b/crates/openshell-driver-podman/README.md @@ -46,6 +46,7 @@ The container spec in `container.rs` sets these security-critical fields: | `no_new_privileges` | `true` | Prevents privilege escalation after exec. | | `seccomp_profile_path` | `unconfined` | The supervisor installs its own policy-aware BPF filter. A container-level profile can block Landlock/seccomp syscalls during setup. | | `mounts` | Private tmpfs at `/run/netns` | Lets the supervisor create named network namespaces in rootless Podman. | +| CDI GPU devices | Sandbox `gpu_device` value when set, otherwise all NVIDIA GPUs | Exposes requested GPUs to GPU-enabled sandbox containers. | The restricted agent child does not retain these supervisor privileges. diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index 3c5df292f..5b9b0d735 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -4,7 +4,7 @@ //! Container spec construction for the Podman driver. use crate::config::PodmanComputeConfig; -use openshell_core::config::CDI_GPU_DEVICE_ALL; +use openshell_core::gpu::cdi_gpu_device_ids; use openshell_core::proto::compute::v1::DriverSandbox; use serde::Serialize; use serde_json::Value; @@ -345,13 +345,13 @@ fn build_resource_limits(sandbox: &DriverSandbox) -> ResourceLimits { /// Build CDI GPU device list if GPU is requested. fn build_devices(sandbox: &DriverSandbox) -> Option> { - if sandbox.spec.as_ref().is_some_and(|s| s.gpu) { - Some(vec![LinuxDevice { - path: CDI_GPU_DEVICE_ALL.into(), - }]) - } else { - None - } + let spec = sandbox.spec.as_ref()?; + cdi_gpu_device_ids(spec.gpu, &spec.gpu_device).map(|device_ids| { + device_ids + .into_iter() + .map(|path| LinuxDevice { path }) + .collect() + }) } /// Build the Podman container creation JSON spec. @@ -687,6 +687,53 @@ mod tests { assert_eq!(short_id("short"), "short"); } + #[test] + fn container_spec_omits_devices_without_gpu_request() { + let sandbox = test_sandbox("test-id", "test-name"); + let config = test_config(); + let spec = build_container_spec(&sandbox, &config); + + assert!(spec.get("devices").is_none()); + } + + #[test] + fn container_spec_maps_empty_gpu_request_to_all_cdi_device() { + use openshell_core::config::CDI_GPU_DEVICE_ALL; + use openshell_core::proto::compute::v1::DriverSandboxSpec; + + let mut sandbox = test_sandbox("test-id", "test-name"); + sandbox.spec = Some(DriverSandboxSpec { + gpu: true, + ..Default::default() + }); + let config = test_config(); + let spec = build_container_spec(&sandbox, &config); + + assert_eq!( + spec["devices"][0]["path"].as_str(), + Some(CDI_GPU_DEVICE_ALL) + ); + } + + #[test] + fn container_spec_passes_explicit_cdi_device_id_through() { + use openshell_core::proto::compute::v1::DriverSandboxSpec; + + let mut sandbox = test_sandbox("test-id", "test-name"); + sandbox.spec = Some(DriverSandboxSpec { + gpu: true, + gpu_device: "nvidia.com/gpu=0".to_string(), + ..Default::default() + }); + let config = test_config(); + let spec = build_container_spec(&sandbox, &config); + + assert_eq!( + spec["devices"][0]["path"].as_str(), + Some("nvidia.com/gpu=0") + ); + } + #[test] fn container_spec_includes_required_capabilities() { let sandbox = test_sandbox("test-id", "test-name"); diff --git a/crates/openshell-driver-podman/src/driver.rs b/crates/openshell-driver-podman/src/driver.rs index ad4d7a192..f78c5c730 100644 --- a/crates/openshell-driver-podman/src/driver.rs +++ b/crates/openshell-driver-podman/src/driver.rs @@ -199,6 +199,10 @@ impl PodmanComputeDriver { sandbox: &DriverSandbox, ) -> Result<(), ComputeDriverError> { let gpu_requested = sandbox.spec.as_ref().is_some_and(|s| s.gpu); + Self::validate_gpu_request(gpu_requested) + } + + fn validate_gpu_request(gpu_requested: bool) -> Result<(), ComputeDriverError> { if gpu_requested && !Self::has_gpu_capacity() { return Err(ComputeDriverError::Precondition( "GPU sandbox requested, but no NVIDIA GPU devices are available.".to_string(), diff --git a/e2e/rust/Cargo.toml b/e2e/rust/Cargo.toml index 57bc1ff68..2357d3369 100644 --- a/e2e/rust/Cargo.toml +++ b/e2e/rust/Cargo.toml @@ -18,18 +18,16 @@ publish = false [features] e2e = [] e2e-docker = ["e2e"] -e2e-docker-gpu = ["e2e-docker"] +e2e-gpu = ["e2e"] +e2e-docker-gpu = ["e2e-docker", "e2e-gpu"] +e2e-podman = ["e2e"] +e2e-podman-gpu = ["e2e-podman", "e2e-gpu"] [[test]] name = "custom_image" path = "tests/custom_image.rs" required-features = ["e2e-docker"] -[[test]] -name = "docker_gpu" -path = "tests/docker_gpu.rs" -required-features = ["e2e-docker-gpu"] - [[test]] name = "docker_preflight" path = "tests/docker_preflight.rs" @@ -40,6 +38,11 @@ name = "gateway_resume" path = "tests/gateway_resume.rs" required-features = ["e2e-docker"] +[[test]] +name = "gpu_device_selection" +path = "tests/gpu_device_selection.rs" +required-features = ["e2e-gpu"] + [dependencies] tokio = { version = "1.43", features = ["full"] } tempfile = "3" diff --git a/e2e/rust/e2e-podman.sh b/e2e/rust/e2e-podman.sh index 44c2eaeb4..c82891338 100755 --- a/e2e/rust/e2e-podman.sh +++ b/e2e/rust/e2e-podman.sh @@ -9,10 +9,19 @@ set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +E2E_TEST="${OPENSHELL_E2E_PODMAN_TEST:-}" +E2E_FEATURES="${OPENSHELL_E2E_PODMAN_FEATURES:-e2e}" cargo build -p openshell-cli --features openshell-core/dev-settings +TEST_ARGS=( + cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" + --features "${E2E_FEATURES}" +) +if [ -n "${E2E_TEST}" ]; then + TEST_ARGS+=(--test "${E2E_TEST}") +fi +TEST_ARGS+=(-- --nocapture) + exec "${ROOT}/e2e/with-podman-gateway.sh" \ - cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ - --features e2e \ - -- --nocapture + "${TEST_ARGS[@]}" diff --git a/e2e/rust/tests/docker_gpu.rs b/e2e/rust/tests/docker_gpu.rs deleted file mode 100644 index f85dc48b8..000000000 --- a/e2e/rust/tests/docker_gpu.rs +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -#![cfg(feature = "e2e")] - -//! Docker GPU e2e test. -//! -//! Requires a Docker-backed gateway started with Docker CDI support. The -//! `e2e:docker:gpu` mise task starts that gateway with the default sandbox image -//! unless OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE is set. - -use openshell_e2e::harness::output::strip_ansi; -use openshell_e2e::harness::sandbox::SandboxGuard; - -#[tokio::test] -async fn docker_gpu_sandbox_runs_nvidia_smi() { - let mut guard = SandboxGuard::create(&[ - "--gpu", - "--", - "sh", - "-lc", - "gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1); \ - test -n \"$gpu_name\"; \ - printf 'gpu-ok:%s\n' \"$gpu_name\"", - ]) - .await - .expect("GPU sandbox create should succeed"); - - let output = strip_ansi(&guard.create_output); - assert!( - output.contains("gpu-ok:"), - "expected GPU smoke marker in sandbox output:\n{output}" - ); - - guard.cleanup().await; -} diff --git a/e2e/rust/tests/gpu_device_selection.rs b/e2e/rust/tests/gpu_device_selection.rs new file mode 100644 index 000000000..930ae73e1 --- /dev/null +++ b/e2e/rust/tests/gpu_device_selection.rs @@ -0,0 +1,404 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e-gpu")] + +//! GPU device selection e2e tests. +//! +//! Requires a GPU-backed gateway and a sandbox image containing `nvidia-smi`. + +use std::path::{Path, PathBuf}; +use std::process::Stdio; +use std::time::Duration; + +use openshell_e2e::harness::binary::openshell_cmd; +use openshell_e2e::harness::container::ContainerEngine; +use openshell_e2e::harness::output::strip_ansi; +use openshell_e2e::harness::sandbox::SandboxGuard; +use serde_json::{Map, Value}; +use tokio::time::timeout; + +const SANDBOX_CREATE_TIMEOUT: Duration = Duration::from_secs(600); +const GPU_PROBE_DOCKERFILE_STAGE: &str = "gateway"; +const CDI_GPU_DEVICE_ALL: &str = "nvidia.com/gpu=all"; +const CDI_GPU_DEVICE_PREFIX: &str = "nvidia.com/gpu="; + +fn gpu_lines(output: &str) -> Vec { + strip_ansi(output) + .lines() + .map(str::trim) + .filter(|line| line.starts_with("GPU ")) + .map(ToOwned::to_owned) + .collect() +} + +fn workspace_root() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .ancestors() + .nth(2) + .expect("failed to resolve workspace root from CARGO_MANIFEST_DIR") + .to_path_buf() +} + +fn dockerfile_images_gpu_probe_image() -> String { + let dockerfile = workspace_root().join("deploy/docker/Dockerfile.images"); + let contents = std::fs::read_to_string(&dockerfile) + .unwrap_or_else(|err| panic!("failed to read {}: {err}", dockerfile.display())); + + contents + .lines() + .map(str::trim) + .find_map(|line| { + let mut parts = line.split_whitespace(); + let instruction = parts.next()?; + let image = parts.next()?; + let as_keyword = parts.next()?; + let stage = parts.next()?; + + if instruction.eq_ignore_ascii_case("FROM") + && as_keyword.eq_ignore_ascii_case("AS") + && stage == GPU_PROBE_DOCKERFILE_STAGE + { + Some(image) + } else { + None + } + }) + .unwrap_or_else(|| { + panic!( + "failed to find a FROM AS {GPU_PROBE_DOCKERFILE_STAGE} stage in {}", + dockerfile.display() + ) + }) + .to_string() +} + +fn gpu_probe_image() -> String { + std::env::var("OPENSHELL_E2E_GPU_PROBE_IMAGE") + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) + .unwrap_or_else(dockerfile_images_gpu_probe_image) +} + +fn object_string<'a>(object: &'a Map, key: &str) -> Option<&'a str> { + object + .get(key) + .or_else(|| object.get(&key.to_ascii_lowercase())) + .and_then(Value::as_str) +} + +fn discovered_devices_array(info: &Value) -> Option<&Vec> { + info.get("DiscoveredDevices") + .or_else(|| info.get("discoveredDevices")) + .and_then(Value::as_array) +} + +fn host_discovered_devices_array(info: &Value) -> Option<&Vec> { + info.get("Host") + .or_else(|| info.get("host")) + .and_then(discovered_devices_array) +} + +fn collect_cdi_gpu_device_ids_from_devices(devices: &[Value], device_ids: &mut Vec) { + for device in devices { + let Some(device) = device.as_object() else { + continue; + }; + + if object_string(device, "Source") == Some("cdi") + && let Some(device_id) = object_string(device, "ID") + && device_id.starts_with(CDI_GPU_DEVICE_PREFIX) + { + device_ids.push(device_id.to_string()); + } + } +} + +fn parse_cdi_gpu_device_ids(info: &Value) -> Vec { + let mut device_ids = Vec::new(); + + if let Some(devices) = discovered_devices_array(info) { + collect_cdi_gpu_device_ids_from_devices(devices, &mut device_ids); + } + if let Some(devices) = host_discovered_devices_array(info) { + collect_cdi_gpu_device_ids_from_devices(devices, &mut device_ids); + } + + device_ids.sort(); + device_ids.dedup(); + device_ids +} + +fn discovered_cdi_gpu_device_ids() -> Vec { + let engine = ContainerEngine::from_env(); + let output = engine + .command() + .args(["info", "--format", "json"]) + .output() + .unwrap_or_else(|err| panic!("failed to run {} info: {err}", engine.name())); + + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + let combined = format!("{stdout}{stderr}"); + assert!( + output.status.success(), + "{} info --format json failed with status {:?}:\n{}", + engine.name(), + output.status.code(), + combined + ); + + let info: Value = serde_json::from_slice(&output.stdout).unwrap_or_else(|err| { + panic!( + "failed to parse {} info JSON: {err}\n{combined}", + engine.name() + ) + }); + let device_ids = parse_cdi_gpu_device_ids(&info); + assert!( + !device_ids.is_empty(), + "{} info --format json did not report any discovered NVIDIA CDI GPU devices. \ +Expected DiscoveredDevices entries with Source=cdi and ID like nvidia.com/gpu=all.", + engine.name() + ); + device_ids +} + +fn has_cdi_gpu_device(device_id: &str) -> bool { + discovered_cdi_gpu_device_ids() + .iter() + .any(|discovered| discovered == device_id) +} + +fn runtime_gpu_lines(gpu_device: &str) -> Vec { + let engine = ContainerEngine::from_env(); + let image = gpu_probe_image(); + let output = engine + .command() + .args([ + "run", + "--rm", + "--device", + gpu_device, + image.as_str(), + "nvidia-smi", + "-L", + ]) + .output() + .unwrap_or_else(|err| { + panic!( + "failed to run {} GPU probe container with image {image}: {err}", + engine.name() + ) + }); + + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + let combined = format!("{stdout}{stderr}"); + assert!( + output.status.success(), + "{} GPU probe failed for {gpu_device} with image {image} and status {:?}:\n{}", + engine.name(), + output.status.code(), + combined + ); + + let lines = gpu_lines(&stdout); + assert!( + !lines.is_empty(), + "{} GPU probe for {gpu_device} did not report any GPU lines with image {image}:\n{combined}", + engine.name() + ); + lines +} + +async fn sandbox_gpu_lines(gpu_device: Option<&str>) -> Vec { + let mut args = vec!["--gpu"]; + if let Some(gpu_device) = gpu_device { + args.push("--gpu-device"); + args.push(gpu_device); + } + args.extend(["--", "sh", "-lc", "nvidia-smi -L"]); + + let mut guard = SandboxGuard::create(&args) + .await + .expect("GPU sandbox create should succeed"); + + let lines = gpu_lines(&guard.create_output); + guard.cleanup().await; + lines +} + +async fn sandbox_create_output(args: &[&str]) -> String { + let mut cmd = openshell_cmd(); + cmd.arg("sandbox").arg("create").args(args); + cmd.stdout(Stdio::piped()).stderr(Stdio::piped()); + + let output = timeout(SANDBOX_CREATE_TIMEOUT, cmd.output()) + .await + .expect("sandbox create should complete before timeout") + .expect("openshell command should spawn"); + + assert!( + !output.status.success(), + "sandbox create unexpectedly succeeded with invalid GPU device" + ); + + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + strip_ansi(&format!("{stdout}{stderr}")) +} + +#[tokio::test] +async fn gpu_request_without_device_matches_plain_all_gpu_container() { + if !has_cdi_gpu_device(CDI_GPU_DEVICE_ALL) { + eprintln!( + "skipping default GPU request test because {CDI_GPU_DEVICE_ALL} was not discovered" + ); + return; + } + + let expected = runtime_gpu_lines(CDI_GPU_DEVICE_ALL); + let actual = sandbox_gpu_lines(None).await; + + assert_eq!( + actual, expected, + "default GPU request should expose the same GPU lines as a plain all-GPU container" + ); +} + +#[tokio::test] +async fn gpu_request_for_each_discovered_device_matches_plain_container() { + let device_ids: Vec<_> = discovered_cdi_gpu_device_ids() + .into_iter() + .filter(|device_id| device_id != CDI_GPU_DEVICE_ALL) + .collect(); + + if device_ids.is_empty() { + eprintln!( + "skipping per-device GPU request test because no per-device NVIDIA CDI IDs were discovered" + ); + return; + } + + for gpu_device in device_ids { + let expected = runtime_gpu_lines(&gpu_device); + let actual = sandbox_gpu_lines(Some(&gpu_device)).await; + assert_eq!( + actual, expected, + "GPU request for {gpu_device} should expose the same GPU lines as a plain container" + ); + } +} + +#[tokio::test] +async fn gpu_all_device_request_matches_plain_all_gpu_container() { + if !has_cdi_gpu_device(CDI_GPU_DEVICE_ALL) { + eprintln!( + "skipping explicit all-GPU request test because {CDI_GPU_DEVICE_ALL} was not discovered" + ); + return; + } + + let expected = runtime_gpu_lines(CDI_GPU_DEVICE_ALL); + let actual = sandbox_gpu_lines(Some(CDI_GPU_DEVICE_ALL)).await; + + assert_eq!( + actual, expected, + "explicit all-GPU request should expose the same GPU lines as a plain all-GPU container" + ); +} + +#[tokio::test] +async fn gpu_invalid_device_request_fails() { + let output = sandbox_create_output(&[ + "--gpu", + "--gpu-device", + "nvidia.com/gpu=invalid", + "--", + "sh", + "-lc", + "nvidia-smi -L", + ]) + .await; + let output_lower = output.to_ascii_lowercase(); + + assert!( + output.contains("nvidia.com/gpu=invalid") + || output_lower.contains("cdi") + || output_lower.contains("device"), + "expected invalid GPU device failure to mention the requested device or CDI/device resolution:\n{output}" + ); +} + +#[test] +fn parse_cdi_gpu_device_ids_reads_discovered_devices() { + let info = serde_json::json!({ + "DiscoveredDevices": [ + { + "Source": "cdi", + "ID": "example.com/device=foo" + }, + { + "Source": "cdi", + "ID": "nvidia.com/gpu=0" + }, + { + "Source": "cdi", + "ID": "nvidia.com/gpu=all" + } + ] + }); + + assert_eq!( + parse_cdi_gpu_device_ids(&info), + vec![ + "nvidia.com/gpu=0".to_string(), + CDI_GPU_DEVICE_ALL.to_string() + ] + ); +} + +#[test] +fn parse_cdi_gpu_device_ids_reads_lowercase_host_discovered_devices() { + let info = serde_json::json!({ + "host": { + "discoveredDevices": [ + { + "source": "cdi", + "id": "nvidia.com/gpu=1" + }, + { + "Source": "cdi", + "ID": "nvidia.com/gpu=1" + }, + { + "Source": "udev", + "ID": "nvidia.com/gpu=2" + } + ] + } + }); + + assert_eq!( + parse_cdi_gpu_device_ids(&info), + vec!["nvidia.com/gpu=1".to_string()] + ); +} + +#[test] +fn parse_cdi_gpu_device_ids_ignores_unexpected_nested_devices() { + let info = serde_json::json!({ + "host": { + "devices": [ + { + "Source": "cdi", + "ID": "nvidia.com/gpu=2" + } + ] + } + }); + + assert!(parse_cdi_gpu_device_ids(&info).is_empty()); +} diff --git a/e2e/with-podman-gateway.sh b/e2e/with-podman-gateway.sh index ee8073f2b..fd74a4124 100755 --- a/e2e/with-podman-gateway.sh +++ b/e2e/with-podman-gateway.sh @@ -64,6 +64,7 @@ PODMAN_NETWORK_MANAGED=0 PODMAN_SERVICE_PID="" PODMAN_SERVICE_LOG="${WORKDIR}/podman-service.log" PODMAN_SOCKET="" +GPU_MODE="${OPENSHELL_E2E_PODMAN_GPU:-0}" # Isolate CLI/SDK gateway metadata from the developer's real config. export XDG_CONFIG_HOME="${WORKDIR}/config" diff --git a/tasks/test.toml b/tasks/test.toml index bf5741c72..747d3c362 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -12,7 +12,11 @@ description = "Run all end-to-end tests (Rust + Python)" depends = ["e2e:rust", "e2e:python"] ["e2e:gpu"] -description = "Run GPU end-to-end tests" +description = "Run Docker GPU end-to-end tests" +depends = ["e2e:docker:gpu"] + +["e2e:k3s:gpu"] +description = "Run k3s GPU end-to-end tests" depends = ["e2e:python:gpu"] ["test:rust"] @@ -50,6 +54,11 @@ run = "e2e/with-docker-gateway.sh uv run pytest -o python_files='test_*.py' -m g description = "Run Rust CLI e2e tests against a Podman-backed gateway" run = "e2e/rust/e2e-podman.sh" +["e2e:podman:gpu"] +description = "Run GPU e2e against a standalone gateway with the Podman compute driver" +env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu_device_selection", OPENSHELL_E2E_PODMAN_FEATURES = "e2e-podman-gpu" } +run = "e2e/rust/e2e-podman.sh" + ["e2e:vm"] description = "Start openshell-gateway with the VM compute driver and run the cluster-agnostic smoke e2e" run = "e2e/rust/e2e-vm.sh" @@ -60,5 +69,5 @@ run = "e2e/rust/e2e-docker.sh" ["e2e:docker:gpu"] description = "Run GPU e2e against a standalone gateway with the Docker compute driver" -env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "docker_gpu", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" } +env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "gpu_device_selection", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" } run = "e2e/rust/e2e-docker.sh"