Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions .github/workflows/e2e-gpu-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ permissions:

jobs:
e2e-gpu:
name: "E2E GPU (${{ matrix.name }})"
name: "E2E Docker GPU (${{ matrix.name }})"
runs-on: ${{ matrix.runner }}
continue-on-error: ${{ matrix.experimental }}
timeout-minutes: 30
Expand Down Expand Up @@ -55,8 +55,10 @@ jobs:
- name: Log in to GHCR
run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin

- name: Install Python dependencies and generate protobuf stubs
run: uv sync --frozen && mise run --no-deps python:proto
- name: Check Docker GPU prerequisites
run: |
nvidia-smi -L
docker info --format '{{json .CDISpecDirs}}'

- name: Run tests
run: mise run --no-deps --skip-deps e2e:python:gpu
run: mise run --no-deps --skip-deps e2e:docker:gpu
4 changes: 2 additions & 2 deletions crates/openshell-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1061,8 +1061,8 @@ enum SandboxCommands {
#[arg(long)]
gpu: bool,

/// Target a specific GPU by PCI address (e.g. "0000:2d:00.0") or index (e.g. "0", "1").
/// Only valid with --gpu. When omitted with --gpu, the first available GPU is assigned.
/// Target GPUs by CDI device ID, for example "nvidia.com/gpu=0" or "nvidia.com/gpu=all".
/// Only valid with --gpu. When omitted with --gpu, all GPUs are requested.
#[arg(long, requires = "gpu")]
gpu_device: Option<String>,

Expand Down
48 changes: 48 additions & 0 deletions crates/openshell-core/src/gpu.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

//! Shared GPU request helpers.

use crate::config::CDI_GPU_DEVICE_ALL;

/// Map the sandbox GPU request fields onto CDI device identifiers.
///
/// Returns `None` when `gpu` is false (no GPU requested). When a GPU is
/// requested but `gpu_device` is empty, the CDI all-GPU identifier is
/// substituted; any explicit device ID is forwarded verbatim.
#[must_use]
pub fn cdi_gpu_device_ids(gpu: bool, gpu_device: &str) -> Option<Vec<String>> {
    if !gpu {
        return None;
    }
    // An empty request string means "no preference": hand out every GPU via CDI.
    let device_id = if gpu_device.is_empty() {
        CDI_GPU_DEVICE_ALL
    } else {
        gpu_device
    };
    Some(vec![device_id.to_string()])
}

#[cfg(test)]
mod tests {
    use super::*;

    // gpu == false: no device IDs are produced regardless of the device string.
    #[test]
    fn cdi_gpu_device_ids_returns_none_when_absent() {
        assert_eq!(cdi_gpu_device_ids(false, ""), None);
    }

    // gpu == true with an empty device string falls back to the all-GPU CDI ID.
    #[test]
    fn cdi_gpu_device_ids_defaults_empty_request_to_all_gpus() {
        assert_eq!(
            cdi_gpu_device_ids(true, ""),
            Some(vec![CDI_GPU_DEVICE_ALL.to_string()])
        );
    }

    // An explicit CDI device ID is forwarded verbatim, unmodified.
    #[test]
    fn cdi_gpu_device_ids_passes_explicit_device_id_through() {
        assert_eq!(
            cdi_gpu_device_ids(true, "nvidia.com/gpu=0"),
            Some(vec!["nvidia.com/gpu=0".to_string()])
        );
    }
}
1 change: 1 addition & 0 deletions crates/openshell-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
pub mod config;
pub mod error;
pub mod forward;
pub mod gpu;
pub mod image;
pub mod inference;
pub mod metadata;
Expand Down
2 changes: 1 addition & 1 deletion crates/openshell-driver-docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ contract:
| `cap_add` | Grants supervisor-only capabilities required for namespace setup and process inspection. |
| `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. |
| `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. |
| CDI GPU request | Requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |
| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |

The agent child process does not retain these supervisor privileges.

Expand Down
28 changes: 16 additions & 12 deletions crates/openshell-driver-docker/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@ use bollard::query_parameters::{
};
use bytes::Bytes;
use futures::{Stream, StreamExt};
use openshell_core::config::{
CDI_GPU_DEVICE_ALL, DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS,
};
use openshell_core::config::{DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS};
use openshell_core::gpu::cdi_gpu_device_ids;
use openshell_core::proto::compute::v1::{
CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse,
DriverCondition, DriverSandbox, DriverSandboxStatus, DriverSandboxTemplate,
Expand Down Expand Up @@ -309,11 +308,7 @@ impl DockerComputeDriver {
"docker sandboxes require a template image",
));
}
if spec.gpu && !config.supports_gpu {
return Err(Status::failed_precondition(
"docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.",
));
}
Self::validate_gpu_request(spec.gpu, config.supports_gpu)?;
if !template.agent_socket_path.trim().is_empty() {
return Err(Status::failed_precondition(
"docker compute driver does not support template.agent_socket_path",
Expand All @@ -333,6 +328,15 @@ impl DockerComputeDriver {
Ok(())
}

/// Reject a GPU sandbox request when the daemon did not advertise CDI support.
///
/// Returns `Ok(())` when no GPU was requested or when CDI support is present;
/// otherwise a `failed_precondition` status explaining how to enable it.
fn validate_gpu_request(gpu: bool, supports_gpu: bool) -> Result<(), Status> {
    match (gpu, supports_gpu) {
        (true, false) => Err(Status::failed_precondition(
            "docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.",
        )),
        _ => Ok(()),
    }
}

async fn get_sandbox_snapshot(
&self,
sandbox_id: &str,
Expand Down Expand Up @@ -945,11 +949,11 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig
.collect()
}

fn docker_gpu_device_requests(gpu: bool) -> Option<Vec<DeviceRequest>> {
gpu.then(|| {
fn docker_gpu_device_requests(gpu: bool, gpu_device: &str) -> Option<Vec<DeviceRequest>> {
cdi_gpu_device_ids(gpu, gpu_device).map(|device_ids| {
vec![DeviceRequest {
driver: Some("cdi".to_string()),
device_ids: Some(vec![CDI_GPU_DEVICE_ALL.to_string()]),
device_ids: Some(device_ids),
..Default::default()
}]
})
Expand Down Expand Up @@ -996,7 +1000,7 @@ fn build_container_create_body(
host_config: Some(HostConfig {
nano_cpus: resource_limits.nano_cpus,
memory: resource_limits.memory_bytes,
device_requests: docker_gpu_device_requests(spec.gpu),
device_requests: docker_gpu_device_requests(spec.gpu, &spec.gpu_device),
mounts: Some(build_mounts(config)),
restart_policy: Some(RestartPolicy {
name: Some(RestartPolicyNameEnum::UNLESS_STOPPED),
Expand Down
26 changes: 25 additions & 1 deletion crates/openshell-driver-docker/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0

use super::*;
use openshell_core::config::DEFAULT_SERVER_PORT;
use openshell_core::config::{CDI_GPU_DEVICE_ALL, DEFAULT_SERVER_PORT};
use openshell_core::proto::compute::v1::{
DriverResourceRequirements, DriverSandboxSpec, DriverSandboxTemplate,
};
Expand Down Expand Up @@ -425,6 +425,30 @@ fn build_container_create_body_maps_gpu_to_all_cdi_device() {
);
}

#[test]
fn build_container_create_body_passes_explicit_cdi_device_id_through() {
    // GPU support must be advertised by the daemon config for the request to pass validation.
    let mut config = runtime_config();
    config.supports_gpu = true;
    let mut sandbox = test_sandbox();
    let spec = sandbox.spec.as_mut().unwrap();
    spec.gpu = true;
    spec.gpu_device = "nvidia.com/gpu=0".to_string();

    let create_body = build_container_create_body(&sandbox, &config).unwrap();
    // Navigate host_config -> device_requests -> first entry without panicking on None.
    let request = create_body
        .host_config
        .as_ref()
        .and_then(|host_config| host_config.device_requests.as_ref())
        .and_then(|requests| requests.first())
        .expect("GPU request should add a Docker device request");

    // The explicit CDI device ID must pass through unchanged under the "cdi" driver.
    assert_eq!(request.driver.as_deref(), Some("cdi"));
    assert_eq!(
        request.device_ids.as_ref().unwrap(),
        &vec!["nvidia.com/gpu=0".to_string()]
    );
}

#[test]
fn require_sandbox_identifier_rejects_when_id_and_name_are_empty() {
// Regression test: `delete_sandbox` (and the other identifier-keyed
Expand Down
1 change: 1 addition & 0 deletions crates/openshell-driver-podman/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ The container spec in `container.rs` sets these security-critical fields:
| `no_new_privileges` | `true` | Prevents privilege escalation after exec. |
| `seccomp_profile_path` | `unconfined` | The supervisor installs its own policy-aware BPF filter. A container-level profile can block Landlock/seccomp syscalls during setup. |
| `mounts` | Private tmpfs at `/run/netns` | Lets the supervisor create named network namespaces in rootless Podman. |
| CDI GPU devices | Sandbox `gpu_device` value when set, otherwise all NVIDIA GPUs | Exposes requested GPUs to GPU-enabled sandbox containers. |

The restricted agent child does not retain these supervisor privileges.

Expand Down
63 changes: 55 additions & 8 deletions crates/openshell-driver-podman/src/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
//! Container spec construction for the Podman driver.

use crate::config::PodmanComputeConfig;
use openshell_core::config::CDI_GPU_DEVICE_ALL;
use openshell_core::gpu::cdi_gpu_device_ids;
use openshell_core::proto::compute::v1::DriverSandbox;
use serde::Serialize;
use serde_json::Value;
Expand Down Expand Up @@ -345,13 +345,13 @@ fn build_resource_limits(sandbox: &DriverSandbox) -> ResourceLimits {

/// Build CDI GPU device list if GPU is requested.
fn build_devices(sandbox: &DriverSandbox) -> Option<Vec<LinuxDevice>> {
if sandbox.spec.as_ref().is_some_and(|s| s.gpu) {
Some(vec![LinuxDevice {
path: CDI_GPU_DEVICE_ALL.into(),
}])
} else {
None
}
let spec = sandbox.spec.as_ref()?;
cdi_gpu_device_ids(spec.gpu, &spec.gpu_device).map(|device_ids| {
device_ids
.into_iter()
.map(|path| LinuxDevice { path })
.collect()
})
}

/// Build the Podman container creation JSON spec.
Expand Down Expand Up @@ -687,6 +687,53 @@ mod tests {
assert_eq!(short_id("short"), "short");
}

#[test]
fn container_spec_omits_devices_without_gpu_request() {
    // A sandbox with no GPU request must not produce a "devices" key at all.
    let sandbox = test_sandbox("test-id", "test-name");
    let config = test_config();
    let spec = build_container_spec(&sandbox, &config);

    assert!(spec.get("devices").is_none());
}

#[test]
fn container_spec_maps_empty_gpu_request_to_all_cdi_device() {
    use openshell_core::config::CDI_GPU_DEVICE_ALL;
    use openshell_core::proto::compute::v1::DriverSandboxSpec;

    // GPU requested without an explicit device ID: defaults to the all-GPU CDI device.
    let mut sandbox = test_sandbox("test-id", "test-name");
    sandbox.spec = Some(DriverSandboxSpec {
        gpu: true,
        ..Default::default()
    });
    let config = test_config();
    let spec = build_container_spec(&sandbox, &config);

    assert_eq!(
        spec["devices"][0]["path"].as_str(),
        Some(CDI_GPU_DEVICE_ALL)
    );
}

#[test]
fn container_spec_passes_explicit_cdi_device_id_through() {
    use openshell_core::proto::compute::v1::DriverSandboxSpec;

    // An explicit CDI device ID in the spec must appear verbatim as the device path.
    let mut sandbox = test_sandbox("test-id", "test-name");
    sandbox.spec = Some(DriverSandboxSpec {
        gpu: true,
        gpu_device: "nvidia.com/gpu=0".to_string(),
        ..Default::default()
    });
    let config = test_config();
    let spec = build_container_spec(&sandbox, &config);

    assert_eq!(
        spec["devices"][0]["path"].as_str(),
        Some("nvidia.com/gpu=0")
    );
}

#[test]
fn container_spec_includes_required_capabilities() {
let sandbox = test_sandbox("test-id", "test-name");
Expand Down
4 changes: 4 additions & 0 deletions crates/openshell-driver-podman/src/driver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,10 @@ impl PodmanComputeDriver {
sandbox: &DriverSandbox,
) -> Result<(), ComputeDriverError> {
let gpu_requested = sandbox.spec.as_ref().is_some_and(|s| s.gpu);
Self::validate_gpu_request(gpu_requested)
}

fn validate_gpu_request(gpu_requested: bool) -> Result<(), ComputeDriverError> {
if gpu_requested && !Self::has_gpu_capacity() {
return Err(ComputeDriverError::Precondition(
"GPU sandbox requested, but no NVIDIA GPU devices are available.".to_string(),
Expand Down
8 changes: 4 additions & 4 deletions deploy/helm/openshell/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ oc adm policy add-scc-to-user privileged -z default -n openshell

# Deploy openshell with overrides to allow SCC assignment of fsGroup and runAsUser for the gateway
helm install openshell oci://ghcr.io/nvidia/openshell/helm-chart --version <version> -n openshell \
--set pkiInitJob.enabled=false \
--set server.disableTls=true \
--set podSecurityContext.fsGroup=null \
--set securityContext.runAsUser=null
--set pkiInitJob.enabled=false \
--set server.disableTls=true \
--set podSecurityContext.fsGroup=null \
--set securityContext.runAsUser=null
```

## Available versions
Expand Down
15 changes: 9 additions & 6 deletions e2e/rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,16 @@ publish = false
[features]
e2e = []
e2e-docker = ["e2e"]
e2e-docker-gpu = ["e2e-docker"]
e2e-gpu = ["e2e"]
e2e-docker-gpu = ["e2e-docker", "e2e-gpu"]
e2e-podman = ["e2e"]
e2e-podman-gpu = ["e2e-podman", "e2e-gpu"]

[[test]]
name = "custom_image"
path = "tests/custom_image.rs"
required-features = ["e2e-docker"]

[[test]]
name = "docker_gpu"
path = "tests/docker_gpu.rs"
required-features = ["e2e-docker-gpu"]

[[test]]
name = "docker_preflight"
path = "tests/docker_preflight.rs"
Expand All @@ -40,6 +38,11 @@ name = "gateway_resume"
path = "tests/gateway_resume.rs"
required-features = ["e2e-docker"]

[[test]]
name = "gpu_device_selection"
path = "tests/gpu_device_selection.rs"
required-features = ["e2e-gpu"]

[dependencies]
tokio = { version = "1.43", features = ["full"] }
tempfile = "3"
Expand Down
15 changes: 12 additions & 3 deletions e2e/rust/e2e-podman.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,19 @@
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
E2E_TEST="${OPENSHELL_E2E_PODMAN_TEST:-}"
E2E_FEATURES="${OPENSHELL_E2E_PODMAN_FEATURES:-e2e}"

cargo build -p openshell-cli --features openshell-core/dev-settings

TEST_ARGS=(
cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml"
--features "${E2E_FEATURES}"
)
if [ -n "${E2E_TEST}" ]; then
TEST_ARGS+=(--test "${E2E_TEST}")
fi
TEST_ARGS+=(-- --nocapture)

exec "${ROOT}/e2e/with-podman-gateway.sh" \
cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \
--features e2e \
-- --nocapture
"${TEST_ARGS[@]}"
Loading
Loading