Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions .github/workflows/e2e-gpu-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ permissions:

jobs:
e2e-gpu:
name: "E2E GPU (${{ matrix.name }})"
name: "E2E Docker GPU (${{ matrix.name }})"
runs-on: ${{ matrix.runner }}
continue-on-error: ${{ matrix.experimental }}
timeout-minutes: 30
Expand Down Expand Up @@ -55,8 +55,10 @@ jobs:
- name: Log in to GHCR
run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin

- name: Install Python dependencies and generate protobuf stubs
run: uv sync --frozen && mise run --no-deps python:proto
- name: Check Docker GPU prerequisites
run: |
nvidia-smi -L
docker info --format '{{json .CDISpecDirs}}'

- name: Run tests
run: mise run --no-deps --skip-deps e2e:python:gpu
run: mise run --no-deps --skip-deps e2e:docker:gpu
4 changes: 2 additions & 2 deletions crates/openshell-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1061,8 +1061,8 @@ enum SandboxCommands {
#[arg(long)]
gpu: bool,

/// Target a specific GPU by PCI address (e.g. "0000:2d:00.0") or index (e.g. "0", "1").
/// Only valid with --gpu. When omitted with --gpu, the first available GPU is assigned.
/// Target GPUs by CDI device ID, for example "nvidia.com/gpu=0" or "nvidia.com/gpu=all".
/// Only valid with --gpu. When omitted with --gpu, all GPUs are requested.
#[arg(long, requires = "gpu")]
gpu_device: Option<String>,

Expand Down
48 changes: 48 additions & 0 deletions crates/openshell-core/src/gpu.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

//! Shared GPU request helpers.

use crate::config::CDI_GPU_DEVICE_ALL;

/// Map the sandbox GPU request fields onto CDI device identifiers.
///
/// Returns `None` when `gpu` is false (no GPU requested). When a GPU is
/// requested but `gpu_device` is empty, the CDI all-GPU identifier is
/// substituted; any explicit device ID is forwarded verbatim.
#[must_use]
pub fn cdi_gpu_device_ids(gpu: bool, gpu_device: &str) -> Option<Vec<String>> {
    if !gpu {
        return None;
    }
    // An empty request string means "no preference": hand out every GPU via CDI.
    let device_id = if gpu_device.is_empty() {
        CDI_GPU_DEVICE_ALL
    } else {
        gpu_device
    };
    Some(vec![device_id.to_string()])
}

#[cfg(test)]
mod tests {
    use super::*;

    // gpu == false: no device IDs are produced regardless of the device string.
    #[test]
    fn cdi_gpu_device_ids_returns_none_when_absent() {
        assert_eq!(cdi_gpu_device_ids(false, ""), None);
    }

    // gpu == true with an empty device string falls back to the all-GPU CDI ID.
    #[test]
    fn cdi_gpu_device_ids_defaults_empty_request_to_all_gpus() {
        assert_eq!(
            cdi_gpu_device_ids(true, ""),
            Some(vec![CDI_GPU_DEVICE_ALL.to_string()])
        );
    }

    // An explicit CDI device ID is forwarded verbatim, unmodified.
    #[test]
    fn cdi_gpu_device_ids_passes_explicit_device_id_through() {
        assert_eq!(
            cdi_gpu_device_ids(true, "nvidia.com/gpu=0"),
            Some(vec!["nvidia.com/gpu=0".to_string()])
        );
    }
}
1 change: 1 addition & 0 deletions crates/openshell-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
pub mod config;
pub mod error;
pub mod forward;
pub mod gpu;
pub mod image;
pub mod inference;
pub mod metadata;
Expand Down
2 changes: 1 addition & 1 deletion crates/openshell-driver-docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ contract:
| `cap_add` | Grants supervisor-only capabilities required for namespace setup and process inspection. |
| `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. |
| `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. |
| CDI GPU request | Requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |
| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |

The agent child process does not retain these supervisor privileges.

Expand Down
28 changes: 16 additions & 12 deletions crates/openshell-driver-docker/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@ use bollard::query_parameters::{
};
use bytes::Bytes;
use futures::{Stream, StreamExt};
use openshell_core::config::{
CDI_GPU_DEVICE_ALL, DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS,
};
use openshell_core::config::{DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS};
use openshell_core::gpu::cdi_gpu_device_ids;
use openshell_core::proto::compute::v1::{
CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse,
DriverCondition, DriverSandbox, DriverSandboxStatus, DriverSandboxTemplate,
Expand Down Expand Up @@ -309,11 +308,7 @@ impl DockerComputeDriver {
"docker sandboxes require a template image",
));
}
if spec.gpu && !config.supports_gpu {
return Err(Status::failed_precondition(
"docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.",
));
}
Self::validate_gpu_request(spec.gpu, config.supports_gpu)?;
if !template.agent_socket_path.trim().is_empty() {
return Err(Status::failed_precondition(
"docker compute driver does not support template.agent_socket_path",
Expand All @@ -333,6 +328,15 @@ impl DockerComputeDriver {
Ok(())
}

/// Reject a GPU sandbox request when the daemon did not advertise CDI support.
///
/// Returns `Ok(())` when no GPU was requested or when CDI support is present;
/// otherwise a `failed_precondition` status explaining how to enable it.
fn validate_gpu_request(gpu: bool, supports_gpu: bool) -> Result<(), Status> {
    match (gpu, supports_gpu) {
        (true, false) => Err(Status::failed_precondition(
            "docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.",
        )),
        _ => Ok(()),
    }
}

async fn get_sandbox_snapshot(
&self,
sandbox_id: &str,
Expand Down Expand Up @@ -945,11 +949,11 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig
.collect()
}

fn docker_gpu_device_requests(gpu: bool) -> Option<Vec<DeviceRequest>> {
gpu.then(|| {
fn docker_gpu_device_requests(gpu: bool, gpu_device: &str) -> Option<Vec<DeviceRequest>> {
cdi_gpu_device_ids(gpu, gpu_device).map(|device_ids| {
vec![DeviceRequest {
driver: Some("cdi".to_string()),
device_ids: Some(vec![CDI_GPU_DEVICE_ALL.to_string()]),
device_ids: Some(device_ids),
..Default::default()
}]
})
Expand Down Expand Up @@ -996,7 +1000,7 @@ fn build_container_create_body(
host_config: Some(HostConfig {
nano_cpus: resource_limits.nano_cpus,
memory: resource_limits.memory_bytes,
device_requests: docker_gpu_device_requests(spec.gpu),
device_requests: docker_gpu_device_requests(spec.gpu, &spec.gpu_device),
mounts: Some(build_mounts(config)),
restart_policy: Some(RestartPolicy {
name: Some(RestartPolicyNameEnum::UNLESS_STOPPED),
Expand Down
26 changes: 25 additions & 1 deletion crates/openshell-driver-docker/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0

use super::*;
use openshell_core::config::DEFAULT_SERVER_PORT;
use openshell_core::config::{CDI_GPU_DEVICE_ALL, DEFAULT_SERVER_PORT};
use openshell_core::proto::compute::v1::{
DriverResourceRequirements, DriverSandboxSpec, DriverSandboxTemplate,
};
Expand Down Expand Up @@ -425,6 +425,30 @@ fn build_container_create_body_maps_gpu_to_all_cdi_device() {
);
}

#[test]
fn build_container_create_body_passes_explicit_cdi_device_id_through() {
    // GPU support must be advertised by the daemon config for the request to pass validation.
    let mut config = runtime_config();
    config.supports_gpu = true;
    let mut sandbox = test_sandbox();
    let spec = sandbox.spec.as_mut().unwrap();
    spec.gpu = true;
    spec.gpu_device = "nvidia.com/gpu=0".to_string();

    let create_body = build_container_create_body(&sandbox, &config).unwrap();
    // Navigate host_config -> device_requests -> first entry without panicking on None.
    let request = create_body
        .host_config
        .as_ref()
        .and_then(|host_config| host_config.device_requests.as_ref())
        .and_then(|requests| requests.first())
        .expect("GPU request should add a Docker device request");

    // The explicit CDI device ID must pass through unchanged under the "cdi" driver.
    assert_eq!(request.driver.as_deref(), Some("cdi"));
    assert_eq!(
        request.device_ids.as_ref().unwrap(),
        &vec!["nvidia.com/gpu=0".to_string()]
    );
}

#[test]
fn require_sandbox_identifier_rejects_when_id_and_name_are_empty() {
// Regression test: `delete_sandbox` (and the other identifier-keyed
Expand Down
1 change: 1 addition & 0 deletions crates/openshell-driver-podman/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ The container spec in `container.rs` sets these security-critical fields:
| `no_new_privileges` | `true` | Prevents privilege escalation after exec. |
| `seccomp_profile_path` | `unconfined` | The supervisor installs its own policy-aware BPF filter. A container-level profile can block Landlock/seccomp syscalls during setup. |
| `mounts` | Private tmpfs at `/run/netns` | Lets the supervisor create named network namespaces in rootless Podman. |
| CDI GPU devices | Sandbox `gpu_device` value when set, otherwise all NVIDIA GPUs | Exposes requested GPUs to GPU-enabled sandbox containers. |

The restricted agent child does not retain these supervisor privileges.

Expand Down
63 changes: 55 additions & 8 deletions crates/openshell-driver-podman/src/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
//! Container spec construction for the Podman driver.

use crate::config::PodmanComputeConfig;
use openshell_core::config::CDI_GPU_DEVICE_ALL;
use openshell_core::gpu::cdi_gpu_device_ids;
use openshell_core::proto::compute::v1::DriverSandbox;
use serde::Serialize;
use serde_json::Value;
Expand Down Expand Up @@ -345,13 +345,13 @@ fn build_resource_limits(sandbox: &DriverSandbox) -> ResourceLimits {

/// Build CDI GPU device list if GPU is requested.
fn build_devices(sandbox: &DriverSandbox) -> Option<Vec<LinuxDevice>> {
if sandbox.spec.as_ref().is_some_and(|s| s.gpu) {
Some(vec![LinuxDevice {
path: CDI_GPU_DEVICE_ALL.into(),
}])
} else {
None
}
let spec = sandbox.spec.as_ref()?;
cdi_gpu_device_ids(spec.gpu, &spec.gpu_device).map(|device_ids| {
device_ids
.into_iter()
.map(|path| LinuxDevice { path })
.collect()
})
}

/// Build the Podman container creation JSON spec.
Expand Down Expand Up @@ -687,6 +687,53 @@ mod tests {
assert_eq!(short_id("short"), "short");
}

#[test]
fn container_spec_omits_devices_without_gpu_request() {
    // A sandbox with no GPU request must not produce a "devices" key at all.
    let sandbox = test_sandbox("test-id", "test-name");
    let config = test_config();
    let spec = build_container_spec(&sandbox, &config);

    assert!(spec.get("devices").is_none());
}

#[test]
fn container_spec_maps_empty_gpu_request_to_all_cdi_device() {
    use openshell_core::config::CDI_GPU_DEVICE_ALL;
    use openshell_core::proto::compute::v1::DriverSandboxSpec;

    // GPU requested without an explicit device ID: defaults to the all-GPU CDI device.
    let mut sandbox = test_sandbox("test-id", "test-name");
    sandbox.spec = Some(DriverSandboxSpec {
        gpu: true,
        ..Default::default()
    });
    let config = test_config();
    let spec = build_container_spec(&sandbox, &config);

    assert_eq!(
        spec["devices"][0]["path"].as_str(),
        Some(CDI_GPU_DEVICE_ALL)
    );
}

#[test]
fn container_spec_passes_explicit_cdi_device_id_through() {
    use openshell_core::proto::compute::v1::DriverSandboxSpec;

    // An explicit CDI device ID in the spec must appear verbatim as the device path.
    let mut sandbox = test_sandbox("test-id", "test-name");
    sandbox.spec = Some(DriverSandboxSpec {
        gpu: true,
        gpu_device: "nvidia.com/gpu=0".to_string(),
        ..Default::default()
    });
    let config = test_config();
    let spec = build_container_spec(&sandbox, &config);

    assert_eq!(
        spec["devices"][0]["path"].as_str(),
        Some("nvidia.com/gpu=0")
    );
}

#[test]
fn container_spec_includes_required_capabilities() {
let sandbox = test_sandbox("test-id", "test-name");
Expand Down
4 changes: 4 additions & 0 deletions crates/openshell-driver-podman/src/driver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,10 @@ impl PodmanComputeDriver {
sandbox: &DriverSandbox,
) -> Result<(), ComputeDriverError> {
let gpu_requested = sandbox.spec.as_ref().is_some_and(|s| s.gpu);
Self::validate_gpu_request(gpu_requested)
}

fn validate_gpu_request(gpu_requested: bool) -> Result<(), ComputeDriverError> {
if gpu_requested && !Self::has_gpu_capacity() {
return Err(ComputeDriverError::Precondition(
"GPU sandbox requested, but no NVIDIA GPU devices are available.".to_string(),
Expand Down
8 changes: 4 additions & 4 deletions deploy/helm/openshell/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ oc adm policy add-scc-to-user privileged -z default -n openshell

# Deploy openshell with overrides to allow SCC assignment of fsGroup and runAsUser for the gateway
helm install openshell oci://ghcr.io/nvidia/openshell/helm-chart --version <version> -n openshell \
--set pkiInitJob.enabled=false \
--set server.disableTls=true \
--set podSecurityContext.fsGroup=null \
--set securityContext.runAsUser=null
--set pkiInitJob.enabled=false \
--set server.disableTls=true \
--set podSecurityContext.fsGroup=null \
--set securityContext.runAsUser=null
```

## Available versions
Expand Down
15 changes: 9 additions & 6 deletions e2e/rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,16 @@ publish = false
[features]
e2e = []
e2e-docker = ["e2e"]
e2e-docker-gpu = ["e2e-docker"]
e2e-gpu = ["e2e"]
e2e-docker-gpu = ["e2e-docker", "e2e-gpu"]
e2e-podman = ["e2e"]
e2e-podman-gpu = ["e2e-podman", "e2e-gpu"]

[[test]]
name = "custom_image"
path = "tests/custom_image.rs"
required-features = ["e2e-docker"]

[[test]]
name = "docker_gpu"
path = "tests/docker_gpu.rs"
required-features = ["e2e-docker-gpu"]

[[test]]
name = "docker_preflight"
path = "tests/docker_preflight.rs"
Expand All @@ -40,6 +38,11 @@ name = "gateway_resume"
path = "tests/gateway_resume.rs"
required-features = ["e2e-docker"]

[[test]]
name = "gpu_device_selection"
path = "tests/gpu_device_selection.rs"
required-features = ["e2e-gpu"]

[dependencies]
tokio = { version = "1.43", features = ["full"] }
tempfile = "3"
Expand Down
15 changes: 12 additions & 3 deletions e2e/rust/e2e-podman.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,19 @@
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
E2E_TEST="${OPENSHELL_E2E_PODMAN_TEST:-}"
E2E_FEATURES="${OPENSHELL_E2E_PODMAN_FEATURES:-e2e}"

cargo build -p openshell-cli --features openshell-core/dev-settings

TEST_ARGS=(
cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml"
--features "${E2E_FEATURES}"
)
if [ -n "${E2E_TEST}" ]; then
TEST_ARGS+=(--test "${E2E_TEST}")
fi
TEST_ARGS+=(-- --nocapture)

exec "${ROOT}/e2e/with-podman-gateway.sh" \
cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \
--features e2e \
-- --nocapture
"${TEST_ARGS[@]}"
Loading
Loading