From 16b25c152b11094dc6cc8bc014261bd134cc940e Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 7 May 2026 22:07:42 +0200 Subject: [PATCH 1/4] feat(gpu): honor device IDs in Docker and Podman --- crates/openshell-core/src/gpu.rs | 48 ++++++++++++++ crates/openshell-core/src/lib.rs | 1 + crates/openshell-driver-docker/README.md | 2 +- crates/openshell-driver-docker/src/lib.rs | 28 +++++---- crates/openshell-driver-docker/src/tests.rs | 26 +++++++- crates/openshell-driver-podman/README.md | 1 + .../openshell-driver-podman/src/container.rs | 63 ++++++++++++++++--- crates/openshell-driver-podman/src/driver.rs | 4 ++ 8 files changed, 151 insertions(+), 22 deletions(-) create mode 100644 crates/openshell-core/src/gpu.rs diff --git a/crates/openshell-core/src/gpu.rs b/crates/openshell-core/src/gpu.rs new file mode 100644 index 000000000..5df8702ed --- /dev/null +++ b/crates/openshell-core/src/gpu.rs @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Shared GPU request helpers. + +use crate::config::CDI_GPU_DEVICE_ALL; + +/// Resolve the existing GPU request fields into CDI device identifiers. +/// +/// `None` means no GPU was requested. A GPU request with no explicit device +/// ID uses the CDI all-GPU request; otherwise the driver-native ID passes +/// through unchanged. 
+#[must_use]
+pub fn cdi_gpu_device_ids(gpu: bool, gpu_device: &str) -> Option<Vec<String>> {
+    gpu.then(|| {
+        if gpu_device.is_empty() {
+            vec![CDI_GPU_DEVICE_ALL.to_string()]
+        } else {
+            vec![gpu_device.to_string()]
+        }
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn cdi_gpu_device_ids_returns_none_when_absent() {
+        assert_eq!(cdi_gpu_device_ids(false, ""), None);
+    }
+
+    #[test]
+    fn cdi_gpu_device_ids_defaults_empty_request_to_all_gpus() {
+        assert_eq!(
+            cdi_gpu_device_ids(true, ""),
+            Some(vec![CDI_GPU_DEVICE_ALL.to_string()])
+        );
+    }
+
+    #[test]
+    fn cdi_gpu_device_ids_passes_explicit_device_id_through() {
+        assert_eq!(
+            cdi_gpu_device_ids(true, "nvidia.com/gpu=0"),
+            Some(vec!["nvidia.com/gpu=0".to_string()])
+        );
+    }
+}
diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs
index a4a1ea822..893b01f5f 100644
--- a/crates/openshell-core/src/lib.rs
+++ b/crates/openshell-core/src/lib.rs
@@ -12,6 +12,7 @@
 pub mod config;
 pub mod error;
 pub mod forward;
+pub mod gpu;
 pub mod image;
 pub mod inference;
 pub mod metadata;
diff --git a/crates/openshell-driver-docker/README.md b/crates/openshell-driver-docker/README.md
index 7bc8048b2..2c02eaa5e 100644
--- a/crates/openshell-driver-docker/README.md
+++ b/crates/openshell-driver-docker/README.md
@@ -30,7 +30,7 @@ contract:
 | `cap_add` | Grants supervisor-only capabilities required for namespace setup and process inspection. |
 | `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. |
 | `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. |
-| CDI GPU request | Requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |
+| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. 
| The agent child process does not retain these supervisor privileges. diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 0eaef3bce..4d2fb9eeb 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -18,9 +18,8 @@ use bollard::query_parameters::{ }; use bytes::Bytes; use futures::{Stream, StreamExt}; -use openshell_core::config::{ - CDI_GPU_DEVICE_ALL, DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS, -}; +use openshell_core::config::{DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS}; +use openshell_core::gpu::cdi_gpu_device_ids; use openshell_core::proto::compute::v1::{ CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse, DriverCondition, DriverSandbox, DriverSandboxStatus, DriverSandboxTemplate, @@ -309,11 +308,7 @@ impl DockerComputeDriver { "docker sandboxes require a template image", )); } - if spec.gpu && !config.supports_gpu { - return Err(Status::failed_precondition( - "docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.", - )); - } + Self::validate_gpu_request(spec.gpu, config.supports_gpu)?; if !template.agent_socket_path.trim().is_empty() { return Err(Status::failed_precondition( "docker compute driver does not support template.agent_socket_path", @@ -333,6 +328,15 @@ impl DockerComputeDriver { Ok(()) } + fn validate_gpu_request(gpu: bool, supports_gpu: bool) -> Result<(), Status> { + if gpu && !supports_gpu { + return Err(Status::failed_precondition( + "docker GPU sandboxes require Docker CDI support. 
Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.",
+            ));
+        }
+        Ok(())
+    }
+
     async fn get_sandbox_snapshot(
         &self,
         sandbox_id: &str,
@@ -945,11 +949,11 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig
         .collect()
 }
 
-fn docker_gpu_device_requests(gpu: bool) -> Option<Vec<DeviceRequest>> {
-    gpu.then(|| {
+fn docker_gpu_device_requests(gpu: bool, gpu_device: &str) -> Option<Vec<DeviceRequest>> {
+    cdi_gpu_device_ids(gpu, gpu_device).map(|device_ids| {
         vec![DeviceRequest {
             driver: Some("cdi".to_string()),
-            device_ids: Some(vec![CDI_GPU_DEVICE_ALL.to_string()]),
+            device_ids: Some(device_ids),
             ..Default::default()
         }]
     })
@@ -996,7 +1000,7 @@ fn build_container_create_body(
         host_config: Some(HostConfig {
             nano_cpus: resource_limits.nano_cpus,
             memory: resource_limits.memory_bytes,
-            device_requests: docker_gpu_device_requests(spec.gpu),
+            device_requests: docker_gpu_device_requests(spec.gpu, &spec.gpu_device),
             mounts: Some(build_mounts(config)),
             restart_policy: Some(RestartPolicy {
                 name: Some(RestartPolicyNameEnum::UNLESS_STOPPED),
diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs
index e41f2688e..9bda6da82 100644
--- a/crates/openshell-driver-docker/src/tests.rs
+++ b/crates/openshell-driver-docker/src/tests.rs
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 use super::*;
-use openshell_core::config::DEFAULT_SERVER_PORT;
+use openshell_core::config::{CDI_GPU_DEVICE_ALL, DEFAULT_SERVER_PORT};
 use openshell_core::proto::compute::v1::{
     DriverResourceRequirements, DriverSandboxSpec, DriverSandboxTemplate,
 };
@@ -425,6 +425,30 @@ fn build_container_create_body_maps_gpu_to_all_cdi_device() {
     );
 }
 
+#[test]
+fn build_container_create_body_passes_explicit_cdi_device_id_through() {
+    let mut config = runtime_config();
+    config.supports_gpu = true;
+    let mut sandbox = test_sandbox();
+    let spec = sandbox.spec.as_mut().unwrap();
+    spec.gpu = true;
+    spec.gpu_device = 
"nvidia.com/gpu=0".to_string(); + + let create_body = build_container_create_body(&sandbox, &config).unwrap(); + let request = create_body + .host_config + .as_ref() + .and_then(|host_config| host_config.device_requests.as_ref()) + .and_then(|requests| requests.first()) + .expect("GPU request should add a Docker device request"); + + assert_eq!(request.driver.as_deref(), Some("cdi")); + assert_eq!( + request.device_ids.as_ref().unwrap(), + &vec!["nvidia.com/gpu=0".to_string()] + ); +} + #[test] fn require_sandbox_identifier_rejects_when_id_and_name_are_empty() { // Regression test: `delete_sandbox` (and the other identifier-keyed diff --git a/crates/openshell-driver-podman/README.md b/crates/openshell-driver-podman/README.md index d853bb5ea..5b88010e4 100644 --- a/crates/openshell-driver-podman/README.md +++ b/crates/openshell-driver-podman/README.md @@ -46,6 +46,7 @@ The container spec in `container.rs` sets these security-critical fields: | `no_new_privileges` | `true` | Prevents privilege escalation after exec. | | `seccomp_profile_path` | `unconfined` | The supervisor installs its own policy-aware BPF filter. A container-level profile can block Landlock/seccomp syscalls during setup. | | `mounts` | Private tmpfs at `/run/netns` | Lets the supervisor create named network namespaces in rootless Podman. | +| CDI GPU devices | Sandbox `gpu_device` value when set, otherwise all NVIDIA GPUs | Exposes requested GPUs to GPU-enabled sandbox containers. | The restricted agent child does not retain these supervisor privileges. diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index 3c5df292f..5b9b0d735 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -4,7 +4,7 @@ //! Container spec construction for the Podman driver. 
use crate::config::PodmanComputeConfig;
-use openshell_core::config::CDI_GPU_DEVICE_ALL;
+use openshell_core::gpu::cdi_gpu_device_ids;
 use openshell_core::proto::compute::v1::DriverSandbox;
 use serde::Serialize;
 use serde_json::Value;
@@ -345,13 +345,13 @@ fn build_resource_limits(sandbox: &DriverSandbox) -> ResourceLimits {
 
 /// Build CDI GPU device list if GPU is requested.
 fn build_devices(sandbox: &DriverSandbox) -> Option<Vec<LinuxDevice>> {
-    if sandbox.spec.as_ref().is_some_and(|s| s.gpu) {
-        Some(vec![LinuxDevice {
-            path: CDI_GPU_DEVICE_ALL.into(),
-        }])
-    } else {
-        None
-    }
+    let spec = sandbox.spec.as_ref()?;
+    cdi_gpu_device_ids(spec.gpu, &spec.gpu_device).map(|device_ids| {
+        device_ids
+            .into_iter()
+            .map(|path| LinuxDevice { path })
+            .collect()
+    })
 }
 
 /// Build the Podman container creation JSON spec.
@@ -687,6 +687,53 @@ mod tests {
         assert_eq!(short_id("short"), "short");
     }
 
+    #[test]
+    fn container_spec_omits_devices_without_gpu_request() {
+        let sandbox = test_sandbox("test-id", "test-name");
+        let config = test_config();
+        let spec = build_container_spec(&sandbox, &config);
+
+        assert!(spec.get("devices").is_none());
+    }
+
+    #[test]
+    fn container_spec_maps_empty_gpu_request_to_all_cdi_device() {
+        use openshell_core::config::CDI_GPU_DEVICE_ALL;
+        use openshell_core::proto::compute::v1::DriverSandboxSpec;
+
+        let mut sandbox = test_sandbox("test-id", "test-name");
+        sandbox.spec = Some(DriverSandboxSpec {
+            gpu: true,
+            ..Default::default()
+        });
+        let config = test_config();
+        let spec = build_container_spec(&sandbox, &config);
+
+        assert_eq!(
+            spec["devices"][0]["path"].as_str(),
+            Some(CDI_GPU_DEVICE_ALL)
+        );
+    }
+
+    #[test]
+    fn container_spec_passes_explicit_cdi_device_id_through() {
+        use openshell_core::proto::compute::v1::DriverSandboxSpec;
+
+        let mut sandbox = test_sandbox("test-id", "test-name");
+        sandbox.spec = Some(DriverSandboxSpec {
+            gpu: true,
+            gpu_device: "nvidia.com/gpu=0".to_string(),
+            ..Default::default()
+        });
+        let config = 
test_config(); + let spec = build_container_spec(&sandbox, &config); + + assert_eq!( + spec["devices"][0]["path"].as_str(), + Some("nvidia.com/gpu=0") + ); + } + #[test] fn container_spec_includes_required_capabilities() { let sandbox = test_sandbox("test-id", "test-name"); diff --git a/crates/openshell-driver-podman/src/driver.rs b/crates/openshell-driver-podman/src/driver.rs index ad4d7a192..f78c5c730 100644 --- a/crates/openshell-driver-podman/src/driver.rs +++ b/crates/openshell-driver-podman/src/driver.rs @@ -199,6 +199,10 @@ impl PodmanComputeDriver { sandbox: &DriverSandbox, ) -> Result<(), ComputeDriverError> { let gpu_requested = sandbox.spec.as_ref().is_some_and(|s| s.gpu); + Self::validate_gpu_request(gpu_requested) + } + + fn validate_gpu_request(gpu_requested: bool) -> Result<(), ComputeDriverError> { if gpu_requested && !Self::has_gpu_capacity() { return Err(ComputeDriverError::Precondition( "GPU sandbox requested, but no NVIDIA GPU devices are available.".to_string(), From 12712b4bc6a5198ca39885eb056e3aaab3b91f1f Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 7 May 2026 23:49:17 +0200 Subject: [PATCH 2/4] test(gpu): add Docker and Podman device selection e2e --- TESTING.md | 6 + crates/openshell-cli/src/main.rs | 4 +- e2e/rust/Cargo.toml | 15 +- e2e/rust/e2e-podman.sh | 15 +- e2e/rust/tests/docker_gpu.rs | 36 ---- e2e/rust/tests/gpu_device_selection.rs | 230 +++++++++++++++++++++++++ e2e/with-podman-gateway.sh | 1 + tasks/test.toml | 7 +- 8 files changed, 266 insertions(+), 48 deletions(-) delete mode 100644 e2e/rust/tests/docker_gpu.rs create mode 100644 e2e/rust/tests/gpu_device_selection.rs diff --git a/TESTING.md b/TESTING.md index 49c9b781a..bfcb5ea8a 100644 --- a/TESTING.md +++ b/TESTING.md @@ -151,6 +151,12 @@ Suites: - Docker suite (`--features e2e-docker`) - common suite plus Docker-only coverage such as Dockerfile image builds, Docker preflight checks, and managed Docker gateway resume. 
- Docker GPU suite (`--features e2e-docker-gpu`) - Docker suite plus GPU sandbox smoke coverage. +GPU device-selection tests compare OpenShell sandboxes against a plain Docker or +Podman container that requests `--device nvidia.com/gpu=all`. The probe image +defaults to the image used by the `gateway` stage in +`deploy/docker/Dockerfile.images`; set `OPENSHELL_E2E_GPU_PROBE_IMAGE` to +override it. + Run the Docker-backed Rust CLI e2e suite: ```shell diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index cd14568ef..7136e774f 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -1061,8 +1061,8 @@ enum SandboxCommands { #[arg(long)] gpu: bool, - /// Target a specific GPU by PCI address (e.g. "0000:2d:00.0") or index (e.g. "0", "1"). - /// Only valid with --gpu. When omitted with --gpu, the first available GPU is assigned. + /// Target GPUs by CDI device ID, for example "nvidia.com/gpu=0" or "nvidia.com/gpu=all". + /// Only valid with --gpu. When omitted with --gpu, all GPUs are requested. 
#[arg(long, requires = "gpu")] gpu_device: Option, diff --git a/e2e/rust/Cargo.toml b/e2e/rust/Cargo.toml index 57bc1ff68..2357d3369 100644 --- a/e2e/rust/Cargo.toml +++ b/e2e/rust/Cargo.toml @@ -18,18 +18,16 @@ publish = false [features] e2e = [] e2e-docker = ["e2e"] -e2e-docker-gpu = ["e2e-docker"] +e2e-gpu = ["e2e"] +e2e-docker-gpu = ["e2e-docker", "e2e-gpu"] +e2e-podman = ["e2e"] +e2e-podman-gpu = ["e2e-podman", "e2e-gpu"] [[test]] name = "custom_image" path = "tests/custom_image.rs" required-features = ["e2e-docker"] -[[test]] -name = "docker_gpu" -path = "tests/docker_gpu.rs" -required-features = ["e2e-docker-gpu"] - [[test]] name = "docker_preflight" path = "tests/docker_preflight.rs" @@ -40,6 +38,11 @@ name = "gateway_resume" path = "tests/gateway_resume.rs" required-features = ["e2e-docker"] +[[test]] +name = "gpu_device_selection" +path = "tests/gpu_device_selection.rs" +required-features = ["e2e-gpu"] + [dependencies] tokio = { version = "1.43", features = ["full"] } tempfile = "3" diff --git a/e2e/rust/e2e-podman.sh b/e2e/rust/e2e-podman.sh index 44c2eaeb4..c82891338 100755 --- a/e2e/rust/e2e-podman.sh +++ b/e2e/rust/e2e-podman.sh @@ -9,10 +9,19 @@ set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" +E2E_TEST="${OPENSHELL_E2E_PODMAN_TEST:-}" +E2E_FEATURES="${OPENSHELL_E2E_PODMAN_FEATURES:-e2e}" cargo build -p openshell-cli --features openshell-core/dev-settings +TEST_ARGS=( + cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" + --features "${E2E_FEATURES}" +) +if [ -n "${E2E_TEST}" ]; then + TEST_ARGS+=(--test "${E2E_TEST}") +fi +TEST_ARGS+=(-- --nocapture) + exec "${ROOT}/e2e/with-podman-gateway.sh" \ - cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ - --features e2e \ - -- --nocapture + "${TEST_ARGS[@]}" diff --git a/e2e/rust/tests/docker_gpu.rs b/e2e/rust/tests/docker_gpu.rs deleted file mode 100644 index f85dc48b8..000000000 --- a/e2e/rust/tests/docker_gpu.rs +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -#![cfg(feature = "e2e")] - -//! Docker GPU e2e test. -//! -//! Requires a Docker-backed gateway started with Docker CDI support. The -//! `e2e:docker:gpu` mise task starts that gateway with the default sandbox image -//! unless OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE is set. 
- -use openshell_e2e::harness::output::strip_ansi; -use openshell_e2e::harness::sandbox::SandboxGuard; - -#[tokio::test] -async fn docker_gpu_sandbox_runs_nvidia_smi() { - let mut guard = SandboxGuard::create(&[ - "--gpu", - "--", - "sh", - "-lc", - "gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1); \ - test -n \"$gpu_name\"; \ - printf 'gpu-ok:%s\n' \"$gpu_name\"", - ]) - .await - .expect("GPU sandbox create should succeed"); - - let output = strip_ansi(&guard.create_output); - assert!( - output.contains("gpu-ok:"), - "expected GPU smoke marker in sandbox output:\n{output}" - ); - - guard.cleanup().await; -} diff --git a/e2e/rust/tests/gpu_device_selection.rs b/e2e/rust/tests/gpu_device_selection.rs new file mode 100644 index 000000000..4ca2c8dcd --- /dev/null +++ b/e2e/rust/tests/gpu_device_selection.rs @@ -0,0 +1,230 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e-gpu")] + +//! GPU device selection e2e tests. +//! +//! Requires a GPU-backed gateway and a sandbox image containing `nvidia-smi`. 
+
+use std::path::{Path, PathBuf};
+use std::process::Stdio;
+use std::time::Duration;
+
+use openshell_e2e::harness::binary::openshell_cmd;
+use openshell_e2e::harness::container::ContainerEngine;
+use openshell_e2e::harness::output::strip_ansi;
+use openshell_e2e::harness::sandbox::SandboxGuard;
+use tokio::time::timeout;
+
+const SANDBOX_CREATE_TIMEOUT: Duration = Duration::from_secs(600);
+const GPU_PROBE_DOCKERFILE_STAGE: &str = "gateway";
+
+fn gpu_lines(output: &str) -> Vec<String> {
+    strip_ansi(output)
+        .lines()
+        .map(str::trim)
+        .filter(|line| line.starts_with("GPU "))
+        .map(ToOwned::to_owned)
+        .collect()
+}
+
+fn gpu_uuid(line: &str) -> &str {
+    let (_, uuid) = line
+        .rsplit_once("(UUID: ")
+        .unwrap_or_else(|| panic!("GPU line did not include a UUID: {line}"));
+    uuid.strip_suffix(')').unwrap_or(uuid)
+}
+
+fn workspace_root() -> PathBuf {
+    Path::new(env!("CARGO_MANIFEST_DIR"))
+        .ancestors()
+        .nth(2)
+        .expect("failed to resolve workspace root from CARGO_MANIFEST_DIR")
+        .to_path_buf()
+}
+
+fn dockerfile_images_gpu_probe_image() -> String {
+    let dockerfile = workspace_root().join("deploy/docker/Dockerfile.images");
+    let contents = std::fs::read_to_string(&dockerfile)
+        .unwrap_or_else(|err| panic!("failed to read {}: {err}", dockerfile.display()));
+
+    contents
+        .lines()
+        .map(str::trim)
+        .find_map(|line| {
+            let mut parts = line.split_whitespace();
+            let instruction = parts.next()?;
+            let image = parts.next()?;
+            let as_keyword = parts.next()?;
+            let stage = parts.next()?;
+
+            if instruction.eq_ignore_ascii_case("FROM")
+                && as_keyword.eq_ignore_ascii_case("AS")
+                && stage == GPU_PROBE_DOCKERFILE_STAGE
+            {
+                Some(image)
+            } else {
+                None
+            }
+        })
+        .unwrap_or_else(|| {
+            panic!(
+                "failed to find a FROM <image> AS {GPU_PROBE_DOCKERFILE_STAGE} stage in {}",
+                dockerfile.display()
+            )
+        })
+        .to_string()
+}
+
+fn gpu_probe_image() -> String {
+    std::env::var("OPENSHELL_E2E_GPU_PROBE_IMAGE")
+        .ok()
+        .map(|value| value.trim().to_string())
+        
.filter(|value| !value.is_empty())
+        .unwrap_or_else(dockerfile_images_gpu_probe_image)
+}
+
+fn runtime_gpu_lines() -> Vec<String> {
+    let engine = ContainerEngine::from_env();
+    let image = gpu_probe_image();
+    let output = engine
+        .command()
+        .args([
+            "run",
+            "--rm",
+            "--device",
+            "nvidia.com/gpu=all",
+            image.as_str(),
+            "nvidia-smi",
+            "-L",
+        ])
+        .output()
+        .unwrap_or_else(|err| {
+            panic!(
+                "failed to run {} GPU probe container with image {image}: {err}",
+                engine.name()
+            )
+        });
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    let combined = format!("{stdout}{stderr}");
+    assert!(
+        output.status.success(),
+        "{} GPU probe failed with image {image} and status {:?}:\n{}",
+        engine.name(),
+        output.status.code(),
+        combined
+    );
+
+    let lines = gpu_lines(&stdout);
+    assert!(
+        !lines.is_empty(),
+        "{} GPU probe did not report any GPU lines with image {image}:\n{combined}",
+        engine.name()
+    );
+    lines
+}
+
+async fn sandbox_gpu_lines(gpu_device: Option<&str>) -> Vec<String> {
+    let mut args = vec!["--gpu"];
+    if let Some(gpu_device) = gpu_device {
+        args.push("--gpu-device");
+        args.push(gpu_device);
+    }
+    args.extend(["--", "sh", "-lc", "nvidia-smi -L"]);
+
+    let mut guard = SandboxGuard::create(&args)
+        .await
+        .expect("GPU sandbox create should succeed");
+
+    let lines = gpu_lines(&guard.create_output);
+    guard.cleanup().await;
+    lines
+}
+
+async fn sandbox_create_output(args: &[&str]) -> String {
+    let mut cmd = openshell_cmd();
+    cmd.arg("sandbox").arg("create").args(args);
+    cmd.stdout(Stdio::piped()).stderr(Stdio::piped());
+
+    let output = timeout(SANDBOX_CREATE_TIMEOUT, cmd.output())
+        .await
+        .expect("sandbox create should complete before timeout")
+        .expect("openshell command should spawn");
+
+    assert!(
+        !output.status.success(),
+        "sandbox create unexpectedly succeeded with invalid GPU device"
+    );
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    let stderr = 
String::from_utf8_lossy(&output.stderr); + strip_ansi(&format!("{stdout}{stderr}")) +} + +#[tokio::test] +async fn gpu_request_without_device_matches_plain_all_gpu_container() { + let expected = runtime_gpu_lines(); + let actual = sandbox_gpu_lines(None).await; + + assert_eq!( + actual, expected, + "default GPU request should expose the same GPU lines as a plain all-GPU container" + ); +} + +#[tokio::test] +async fn gpu_request_for_each_index_exposes_requested_gpu_uuid() { + let expected = runtime_gpu_lines(); + + for (index, expected_line) in expected.iter().enumerate() { + let gpu_device = format!("nvidia.com/gpu={index}"); + let actual = sandbox_gpu_lines(Some(&gpu_device)).await; + assert_eq!( + actual.len(), + 1, + "GPU request for {gpu_device} should expose one GPU line:\n{actual:#?}" + ); + + assert_eq!( + gpu_uuid(&actual[0]), + gpu_uuid(expected_line), + "GPU request for {gpu_device} should expose the matching physical GPU UUID" + ); + } +} + +#[tokio::test] +async fn gpu_all_device_request_matches_plain_all_gpu_container() { + let expected = runtime_gpu_lines(); + let actual = sandbox_gpu_lines(Some("nvidia.com/gpu=all")).await; + + assert_eq!( + actual, expected, + "explicit all-GPU request should expose the same GPU lines as a plain all-GPU container" + ); +} + +#[tokio::test] +async fn gpu_invalid_device_request_fails() { + let output = sandbox_create_output(&[ + "--gpu", + "--gpu-device", + "nvidia.com/gpu=invalid", + "--", + "sh", + "-lc", + "nvidia-smi -L", + ]) + .await; + let output_lower = output.to_ascii_lowercase(); + + assert!( + output.contains("nvidia.com/gpu=invalid") + || output_lower.contains("cdi") + || output_lower.contains("device"), + "expected invalid GPU device failure to mention the requested device or CDI/device resolution:\n{output}" + ); +} diff --git a/e2e/with-podman-gateway.sh b/e2e/with-podman-gateway.sh index ee8073f2b..fd74a4124 100755 --- a/e2e/with-podman-gateway.sh +++ b/e2e/with-podman-gateway.sh @@ -64,6 +64,7 @@ 
PODMAN_NETWORK_MANAGED=0 PODMAN_SERVICE_PID="" PODMAN_SERVICE_LOG="${WORKDIR}/podman-service.log" PODMAN_SOCKET="" +GPU_MODE="${OPENSHELL_E2E_PODMAN_GPU:-0}" # Isolate CLI/SDK gateway metadata from the developer's real config. export XDG_CONFIG_HOME="${WORKDIR}/config" diff --git a/tasks/test.toml b/tasks/test.toml index bf5741c72..4863f7f54 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -50,6 +50,11 @@ run = "e2e/with-docker-gateway.sh uv run pytest -o python_files='test_*.py' -m g description = "Run Rust CLI e2e tests against a Podman-backed gateway" run = "e2e/rust/e2e-podman.sh" +["e2e:podman:gpu"] +description = "Run GPU e2e against a standalone gateway with the Podman compute driver" +env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu_device_selection", OPENSHELL_E2E_PODMAN_FEATURES = "e2e-podman-gpu" } +run = "e2e/rust/e2e-podman.sh" + ["e2e:vm"] description = "Start openshell-gateway with the VM compute driver and run the cluster-agnostic smoke e2e" run = "e2e/rust/e2e-vm.sh" @@ -60,5 +65,5 @@ run = "e2e/rust/e2e-docker.sh" ["e2e:docker:gpu"] description = "Run GPU e2e against a standalone gateway with the Docker compute driver" -env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "docker_gpu", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" } +env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "gpu_device_selection", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" } run = "e2e/rust/e2e-docker.sh" From 8d42881c4b1457ee2750a8c6bd5d034e8ece7eda Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 8 May 2026 00:04:28 +0200 Subject: [PATCH 3/4] ci(gpu): run Docker GPU e2e workflow --- .github/workflows/e2e-gpu-test.yaml | 12 ++++++++---- tasks/test.toml | 6 +++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/e2e-gpu-test.yaml b/.github/workflows/e2e-gpu-test.yaml index f61c8c7ae..429a82524 100644 --- a/.github/workflows/e2e-gpu-test.yaml +++ 
b/.github/workflows/e2e-gpu-test.yaml @@ -14,7 +14,7 @@ permissions: jobs: e2e-gpu: - name: "E2E GPU (${{ matrix.name }})" + name: "E2E Docker GPU (${{ matrix.name }})" runs-on: ${{ matrix.runner }} continue-on-error: ${{ matrix.experimental }} timeout-minutes: 30 @@ -55,8 +55,12 @@ jobs: - name: Log in to GHCR run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin - - name: Install Python dependencies and generate protobuf stubs - run: uv sync --frozen && mise run --no-deps python:proto + - name: Check Docker GPU prerequisites + run: | + docker info --format '{{json .CDISpecDirs}}' + GPU_PROBE_IMAGE="$(awk '$1 == "FROM" && $3 == "AS" && $4 == "gateway" { print $2; exit }' deploy/docker/Dockerfile.images)" + test -n "${GPU_PROBE_IMAGE}" + docker run --rm --device nvidia.com/gpu=all "${GPU_PROBE_IMAGE}" nvidia-smi -L - name: Run tests - run: mise run --no-deps --skip-deps e2e:python:gpu + run: mise run --no-deps --skip-deps e2e:docker:gpu diff --git a/tasks/test.toml b/tasks/test.toml index 4863f7f54..747d3c362 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -12,7 +12,11 @@ description = "Run all end-to-end tests (Rust + Python)" depends = ["e2e:rust", "e2e:python"] ["e2e:gpu"] -description = "Run GPU end-to-end tests" +description = "Run Docker GPU end-to-end tests" +depends = ["e2e:docker:gpu"] + +["e2e:k3s:gpu"] +description = "Run k3s GPU end-to-end tests" depends = ["e2e:python:gpu"] ["test:rust"] From 817a97aa2fe6e13ddb27447d3d9af278cc710013 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 8 May 2026 00:12:37 +0200 Subject: [PATCH 4/4] docs(helm): fix README markdown lint --- deploy/helm/openshell/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md index 3662b130a..ee7565f29 100644 --- a/deploy/helm/openshell/README.md +++ b/deploy/helm/openshell/README.md @@ -29,10 +29,10 @@ oc adm policy add-scc-to-user 
privileged -z default -n openshell
 
 # Deploy openshell with overrides to allow SCC assignment of fsGroup and runAsUser for the gateway
 helm install openshell oci://ghcr.io/nvidia/openshell/helm-chart --version <version> -n openshell \
-    --set pkiInitJob.enabled=false \
-    --set server.disableTls=true \
-    --set podSecurityContext.fsGroup=null \
-    --set securityContext.runAsUser=null
+    --set pkiInitJob.enabled=false \
+    --set server.disableTls=true \
+    --set podSecurityContext.fsGroup=null \
+    --set securityContext.runAsUser=null
 ```
 
 ## Available versions