diff --git a/.agents/skills/helm-dev-environment/SKILL.md b/.agents/skills/helm-dev-environment/SKILL.md index 18d8c241e..623efb2e6 100644 --- a/.agents/skills/helm-dev-environment/SKILL.md +++ b/.agents/skills/helm-dev-environment/SKILL.md @@ -57,7 +57,8 @@ mise run helm:skaffold:run ``` Both commands build the `gateway` and `supervisor` images and deploy the OpenShell Helm -chart. The `pkiInitJob` hook runs on first install to generate mTLS secrets. Envoy Gateway opt-in; see the Optional Add-ons section below. +chart. The `pkiInitJob` hook (a pre-install Job that runs `openshell-gateway generate-certs`) +generates mTLS secrets on first install. Envoy Gateway opt-in; see the Optional Add-ons section below. The gateway Service uses ClusterIP. Access is via Envoy Gateway (port `8080`) or `kubectl port-forward`. diff --git a/Cargo.lock b/Cargo.lock index 28d86ea93..808956cd9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3630,10 +3630,13 @@ dependencies = [ "hyper-util", "ipnet", "jsonwebtoken 9.3.1", + "k8s-openapi", + "kube", "metrics", "metrics-exporter-prometheus", "miette", "nix", + "openshell-bootstrap", "openshell-core", "openshell-driver-docker", "openshell-driver-kubernetes", diff --git a/architecture/gateway.md b/architecture/gateway.md index f36878cf1..d89706e64 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -132,6 +132,32 @@ The same relay pattern backs interactive SSH, command execution, and file sync. The gateway tracks live sessions in memory and persists session records so tokens can expire or be revoked. +## PKI Bootstrap + +`openshell-gateway generate-certs` is the one place mTLS materials are +created. Both deployment paths use it: + +| Output mode | Selector | Layout | +|---|---|---| +| Kubernetes Secrets | (default) `--namespace`, `--server-secret-name`, `--client-secret-name` | Two `kubernetes.io/tls` Secrets with `tls.crt` / `tls.key` / `ca.crt`. 
| +| Filesystem | `--output-dir <dir>` | `<dir>/{ca.crt, ca.key, server/tls.{crt,key}, client/tls.{crt,key}}`. Also copies client materials to `$XDG_CONFIG_HOME/openshell/gateways/openshell/mtls/` for CLI auto-discovery. | + +On Kubernetes, the Helm chart runs the command via a pre-install/pre-upgrade +hook Job using the gateway image itself — no separate cert-generation image, +no extra mirror burden in air-gapped environments. On the RPM gateway, the +same command runs from the systemd unit's `ExecStartPre` to bootstrap PKI +into the user's state directory on first start. + +Both modes share the same idempotency contract: all targets present → skip; +partial state → fail with a recovery hint; nothing present → generate and +write. This guards mTLS continuity across restarts and upgrades while still +recovering cleanly if an operator deletes everything and starts over. + +Operators who manage PKI externally (cert-manager, an enterprise CA, or +pre-created Secrets) disable the Helm hook via `pkiInitJob.enabled=false`. +The chart also ships a `certManager.*` path that produces equivalent Secrets +through cert-manager `Issuer`/`Certificate` resources. 
+ ## Operational Constraints - Gateway TLS and client certificate distribution are deployment concerns owned diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index fab20186c..9cba99045 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -15,6 +15,7 @@ name = "openshell-gateway" path = "src/main.rs" [dependencies] +openshell-bootstrap = { path = "../openshell-bootstrap" } openshell-core = { path = "../openshell-core" } openshell-driver-docker = { path = "../openshell-driver-docker" } openshell-driver-kubernetes = { path = "../openshell-driver-kubernetes" } @@ -24,6 +25,10 @@ openshell-policy = { path = "../openshell-policy" } openshell-providers = { path = "../openshell-providers" } openshell-router = { path = "../openshell-router" } +# Kubernetes client (used by the `generate-certs` subcommand) +kube = { workspace = true } +k8s-openapi = { workspace = true } + # Async runtime tokio = { workspace = true } diff --git a/crates/openshell-server/src/certgen.rs b/crates/openshell-server/src/certgen.rs new file mode 100644 index 000000000..b9e4d7bd5 --- /dev/null +++ b/crates/openshell-server/src/certgen.rs @@ -0,0 +1,525 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! `generate-certs` subcommand: bootstrap mTLS PKI for the gateway. +//! +//! Two output modes, dispatched by the presence of `--output-dir`: +//! +//! - **Kubernetes mode** (default): create two `kubernetes.io/tls` Secrets +//! in the supplied namespace. Used by the Helm pre-install hook. Requires +//! `--namespace`, `--server-secret-name`, `--client-secret-name`. +//! - **Local mode** (`--output-dir <dir>`): write PEMs to a filesystem layout +//! matching `deploy/rpm/init-pki.sh`. Used by the RPM systemd unit's +//! `ExecStartPre`. Also copies client materials to +//! 
`$XDG_CONFIG_HOME/openshell/gateways/openshell/mtls/` so the local CLI +//! picks them up automatically. +//! +//! Both modes share the same idempotency contract: all targets present → +//! skip; partial state → error with a recovery hint; nothing present → +//! generate and write. + +use clap::Args; +use k8s_openapi::ByteString; +use k8s_openapi::api::core::v1::Secret; +use kube::Client; +use kube::api::{Api, ObjectMeta, PostParams}; +use miette::{IntoDiagnostic, Result, WrapErr}; +use openshell_bootstrap::pki::{PkiBundle, generate_pki}; +use openshell_core::paths::{create_dir_restricted, set_file_owner_only}; +use std::collections::BTreeMap; +use std::path::{Path, PathBuf}; +use tracing::{info, warn}; +use tracing_subscriber::EnvFilter; + +#[derive(Args, Debug)] +pub struct CertgenArgs { + /// Write PEMs to a filesystem directory instead of Kubernetes Secrets. + /// When set, the kube-related flags are not required. + #[arg(long, value_name = "DIR")] + output_dir: Option, + + /// Kubernetes namespace to create Secrets in. + /// Default comes from `POD_NAMESPACE`, which the Helm hook injects via + /// the downward API. + #[arg(long, env = "POD_NAMESPACE", required_unless_present = "output_dir")] + namespace: Option, + + /// Name of the server TLS Secret (`kubernetes.io/tls`) to create. + #[arg(long, required_unless_present = "output_dir")] + server_secret_name: Option, + + /// Name of the client TLS Secret (`kubernetes.io/tls`) to create. + #[arg(long, required_unless_present = "output_dir")] + client_secret_name: Option, + + /// Extra Subject Alternative Name for the server certificate. Repeatable. + /// Auto-detected as an IP address or DNS name. + #[arg(long = "server-san", value_name = "SAN")] + server_sans: Vec, + + /// Print the generated PEM materials to stdout instead of writing them. + /// For local debugging. 
+ #[arg(long)] + dry_run: bool, +} + +pub async fn run(args: CertgenArgs) -> Result<()> { + tracing_subscriber::fmt() + .with_env_filter( + EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")), + ) + .init(); + + let bundle = generate_pki(&args.server_sans)?; + + if args.dry_run { + print_bundle(&bundle); + return Ok(()); + } + + if let Some(dir) = args.output_dir.as_deref() { + run_local(dir, &bundle) + } else { + run_kubernetes(&args, &bundle).await + } +} + +// ─────────────────────────── Kubernetes mode ─────────────────────────── + +#[derive(Debug, PartialEq, Eq)] +enum K8sAction { + SkipExists, + PartialState, + Create, +} + +fn decide_k8s(server_exists: bool, client_exists: bool) -> K8sAction { + match (server_exists, client_exists) { + (true, true) => K8sAction::SkipExists, + (false, false) => K8sAction::Create, + _ => K8sAction::PartialState, + } +} + +async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { + let namespace = args + .namespace + .as_deref() + .ok_or_else(|| miette::miette!("--namespace is required (or set POD_NAMESPACE)"))?; + let server_name = args + .server_secret_name + .as_deref() + .ok_or_else(|| miette::miette!("--server-secret-name is required"))?; + let client_name = args + .client_secret_name + .as_deref() + .ok_or_else(|| miette::miette!("--client-secret-name is required"))?; + + let client = Client::try_default() + .await + .into_diagnostic() + .wrap_err("failed to construct in-cluster Kubernetes client")?; + let api: Api = Api::namespaced(client, namespace); + + let server_exists = api + .get_opt(server_name) + .await + .into_diagnostic() + .wrap_err_with(|| format!("failed to read secret {server_name}"))? + .is_some(); + let client_exists = api + .get_opt(client_name) + .await + .into_diagnostic() + .wrap_err_with(|| format!("failed to read secret {client_name}"))? 
+ .is_some(); + + match decide_k8s(server_exists, client_exists) { + K8sAction::SkipExists => { + info!( + namespace = %namespace, + server = %server_name, + client = %client_name, + "PKI secrets already exist, skipping." + ); + return Ok(()); + } + K8sAction::PartialState => { + return Err(miette::miette!( + "partial PKI state in namespace {namespace}: exactly one of \ + {server_name} / {client_name} exists. Recover with: \ + kubectl delete secret -n {namespace} {server_name} {client_name}", + )); + } + K8sAction::Create => {} + } + + let server_secret = tls_secret( + server_name, + &bundle.server_cert_pem, + &bundle.server_key_pem, + &bundle.ca_cert_pem, + ); + let client_secret = tls_secret( + client_name, + &bundle.client_cert_pem, + &bundle.client_key_pem, + &bundle.ca_cert_pem, + ); + + api.create(&PostParams::default(), &server_secret) + .await + .into_diagnostic() + .wrap_err_with(|| format!("failed to create secret {server_name}"))?; + api.create(&PostParams::default(), &client_secret) + .await + .into_diagnostic() + .wrap_err_with(|| format!("failed to create secret {client_name}"))?; + + info!( + namespace = %namespace, + server = %server_name, + client = %client_name, + "PKI secrets created." 
+ ); + Ok(()) +} + +fn tls_secret(name: &str, crt_pem: &str, key_pem: &str, ca_pem: &str) -> Secret { + let mut data = BTreeMap::new(); + data.insert( + "tls.crt".to_string(), + ByteString(crt_pem.as_bytes().to_vec()), + ); + data.insert( + "tls.key".to_string(), + ByteString(key_pem.as_bytes().to_vec()), + ); + data.insert("ca.crt".to_string(), ByteString(ca_pem.as_bytes().to_vec())); + Secret { + metadata: ObjectMeta { + name: Some(name.to_string()), + ..Default::default() + }, + type_: Some("kubernetes.io/tls".to_string()), + data: Some(data), + ..Default::default() + } +} + +// ─────────────────────────────── Local mode ─────────────────────────────── + +#[derive(Debug, PartialEq, Eq)] +enum LocalAction { + Skip, + PartialState, + Create, +} + +/// Layout under `` matches `deploy/rpm/init-pki.sh`: +/// +/// ```text +/// /ca.crt +/// /ca.key +/// /server/tls.crt +/// /server/tls.key +/// /client/tls.crt +/// /client/tls.key +/// ``` +struct LocalPaths { + ca_crt: PathBuf, + ca_key: PathBuf, + server_dir: PathBuf, + server_crt: PathBuf, + server_key: PathBuf, + client_dir: PathBuf, + client_crt: PathBuf, + client_key: PathBuf, +} + +impl LocalPaths { + fn resolve(dir: &Path) -> Self { + let server_dir = dir.join("server"); + let client_dir = dir.join("client"); + Self { + ca_crt: dir.join("ca.crt"), + ca_key: dir.join("ca.key"), + server_crt: server_dir.join("tls.crt"), + server_key: server_dir.join("tls.key"), + server_dir, + client_crt: client_dir.join("tls.crt"), + client_key: client_dir.join("tls.key"), + client_dir, + } + } + + fn all_files(&self) -> [&Path; 6] { + [ + &self.ca_crt, + &self.ca_key, + &self.server_crt, + &self.server_key, + &self.client_crt, + &self.client_key, + ] + } + + fn existence_count(&self) -> usize { + self.all_files().iter().filter(|p| p.exists()).count() + } +} + +fn decide_local(present: usize) -> LocalAction { + match present { + 6 => LocalAction::Skip, + 0 => LocalAction::Create, + _ => LocalAction::PartialState, + } +} + +fn 
run_local(dir: &Path, bundle: &PkiBundle) -> Result<()> { + let paths = LocalPaths::resolve(dir); + + match decide_local(paths.existence_count()) { + LocalAction::Skip => { + info!(dir = %dir.display(), "PKI files already exist, skipping."); + } + LocalAction::PartialState => { + return Err(miette::miette!( + "partial PKI state in {dir}: some files exist but not all. \ + Recover with: rm -rf {dir} (the gateway will regenerate on next start)", + dir = dir.display(), + )); + } + LocalAction::Create => { + write_local_bundle(dir, bundle, &paths)?; + info!(dir = %dir.display(), "PKI files created."); + } + } + + // Always make sure the CLI auto-discovery copy is in place. This + // self-heals the case where the operator wiped ~/.config/openshell but + // left the gateway state directory intact. + if let Err(e) = openshell_bootstrap::mtls::store_pki_bundle("openshell", bundle) { + warn!(error = %e, "failed to copy client mTLS materials for CLI auto-discovery"); + } + + Ok(()) +} + +fn write_local_bundle(dir: &Path, bundle: &PkiBundle, paths: &LocalPaths) -> Result<()> { + // Stage to a sibling tmp dir so individual renames into the final layout + // are atomic on the same filesystem. 
+ let temp = sibling_temp_dir(dir); + if temp.exists() { + std::fs::remove_dir_all(&temp) + .into_diagnostic() + .wrap_err_with(|| format!("failed to remove stale {}", temp.display()))?; + } + + let temp_server = temp.join("server"); + let temp_client = temp.join("client"); + create_dir_restricted(&temp)?; + create_dir_restricted(&temp_server)?; + create_dir_restricted(&temp_client)?; + + write_pem(&temp.join("ca.crt"), &bundle.ca_cert_pem, false)?; + write_pem(&temp.join("ca.key"), &bundle.ca_key_pem, true)?; + write_pem(&temp_server.join("tls.crt"), &bundle.server_cert_pem, false)?; + write_pem(&temp_server.join("tls.key"), &bundle.server_key_pem, true)?; + write_pem(&temp_client.join("tls.crt"), &bundle.client_cert_pem, false)?; + write_pem(&temp_client.join("tls.key"), &bundle.client_key_pem, true)?; + + // Final destination (might not exist yet on first run). + create_dir_restricted(dir)?; + create_dir_restricted(&paths.server_dir)?; + create_dir_restricted(&paths.client_dir)?; + + let renames: [(PathBuf, &Path); 6] = [ + (temp.join("ca.crt"), paths.ca_crt.as_path()), + (temp.join("ca.key"), paths.ca_key.as_path()), + (temp_server.join("tls.crt"), paths.server_crt.as_path()), + (temp_server.join("tls.key"), paths.server_key.as_path()), + (temp_client.join("tls.crt"), paths.client_crt.as_path()), + (temp_client.join("tls.key"), paths.client_key.as_path()), + ]; + for (from, to) in &renames { + std::fs::rename(from, to) + .into_diagnostic() + .wrap_err_with(|| format!("failed to move {} -> {}", from.display(), to.display()))?; + } + + let _ = std::fs::remove_dir_all(&temp); + Ok(()) +} + +fn write_pem(path: &Path, contents: &str, owner_only: bool) -> Result<()> { + std::fs::write(path, contents) + .into_diagnostic() + .wrap_err_with(|| format!("failed to write {}", path.display()))?; + if owner_only { + set_file_owner_only(path)?; + } + Ok(()) +} + +fn sibling_temp_dir(dir: &Path) -> PathBuf { + // Use a sibling so std::fs::rename succeeds (same filesystem). 
+ let mut name = dir + .file_name() + .map(std::ffi::OsStr::to_os_string) + .unwrap_or_default(); + name.push(".certgen.tmp"); + dir.with_file_name(name) +} + +// ────────────────────────────── Shared utility ───────────────────────────── + +fn print_bundle(bundle: &PkiBundle) { + println!("# CA certificate\n{}", bundle.ca_cert_pem); + println!("# Server certificate\n{}", bundle.server_cert_pem); + println!("# Server key\n{}", bundle.server_key_pem); + println!("# Client certificate\n{}", bundle.client_cert_pem); + println!("# Client key\n{}", bundle.client_key_pem); +} + +#[cfg(test)] +mod tests { + use super::{ + K8sAction, LocalAction, LocalPaths, decide_k8s, decide_local, sibling_temp_dir, tls_secret, + write_local_bundle, + }; + use openshell_bootstrap::pki::generate_pki; + use std::path::Path; + + // ── Kubernetes-mode decision ── + + #[test] + fn decide_k8s_skip_when_both_exist() { + assert_eq!(decide_k8s(true, true), K8sAction::SkipExists); + } + + #[test] + fn decide_k8s_create_when_neither_exists() { + assert_eq!(decide_k8s(false, false), K8sAction::Create); + } + + #[test] + fn decide_k8s_partial_when_only_server_exists() { + assert_eq!(decide_k8s(true, false), K8sAction::PartialState); + } + + #[test] + fn decide_k8s_partial_when_only_client_exists() { + assert_eq!(decide_k8s(false, true), K8sAction::PartialState); + } + + #[test] + fn tls_secret_has_kubernetes_io_tls_type_and_three_keys() { + let s = tls_secret("foo", "CRT-PEM", "KEY-PEM", "CA-PEM"); + assert_eq!(s.metadata.name.as_deref(), Some("foo")); + assert_eq!(s.type_.as_deref(), Some("kubernetes.io/tls")); + let data = s.data.expect("data set"); + assert_eq!(data.len(), 3); + assert_eq!(data["tls.crt"].0, b"CRT-PEM"); + assert_eq!(data["tls.key"].0, b"KEY-PEM"); + assert_eq!(data["ca.crt"].0, b"CA-PEM"); + } + + // ── Local-mode decision ── + + #[test] + fn decide_local_skip_when_all_six_present() { + assert_eq!(decide_local(6), LocalAction::Skip); + } + + #[test] + fn 
decide_local_create_when_none_present() { + assert_eq!(decide_local(0), LocalAction::Create); + } + + #[test] + fn decide_local_partial_for_any_count_in_between() { + for n in 1..=5 { + assert_eq!(decide_local(n), LocalAction::PartialState, "n = {n}"); + } + } + + // ── Local-mode layout & writes ── + + #[test] + fn local_paths_resolve_matches_init_pki_layout() { + let p = LocalPaths::resolve(Path::new("/tmp/openshell/tls")); + assert_eq!(p.ca_crt, Path::new("/tmp/openshell/tls/ca.crt")); + assert_eq!(p.ca_key, Path::new("/tmp/openshell/tls/ca.key")); + assert_eq!(p.server_crt, Path::new("/tmp/openshell/tls/server/tls.crt")); + assert_eq!(p.server_key, Path::new("/tmp/openshell/tls/server/tls.key")); + assert_eq!(p.client_crt, Path::new("/tmp/openshell/tls/client/tls.crt")); + assert_eq!(p.client_key, Path::new("/tmp/openshell/tls/client/tls.key")); + } + + #[test] + fn sibling_temp_dir_is_adjacent_to_target() { + assert_eq!( + sibling_temp_dir(Path::new("/var/lib/openshell/tls")), + Path::new("/var/lib/openshell/tls.certgen.tmp") + ); + } + + #[test] + fn write_local_bundle_writes_six_files_and_removes_temp() { + let parent = tempfile::tempdir().expect("tempdir"); + let dir = parent.path().join("tls"); + let bundle = generate_pki(&[]).expect("generate_pki"); + let paths = LocalPaths::resolve(&dir); + + write_local_bundle(&dir, &bundle, &paths).expect("write_local_bundle"); + + for f in paths.all_files() { + assert!(f.is_file(), "missing {}", f.display()); + } + assert!( + !sibling_temp_dir(&dir).exists(), + "temp dir should be cleaned up" + ); + + // Spot-check contents. 
+ let ca = std::fs::read_to_string(&paths.ca_crt).unwrap(); + assert!(ca.contains("BEGIN CERTIFICATE")); + let server_key = std::fs::read_to_string(&paths.server_key).unwrap(); + assert!(server_key.contains("BEGIN PRIVATE KEY")); + } + + #[cfg(unix)] + #[test] + fn write_local_bundle_sets_owner_only_on_keys() { + use std::os::unix::fs::PermissionsExt; + let parent = tempfile::tempdir().expect("tempdir"); + let dir = parent.path().join("tls"); + let bundle = generate_pki(&[]).expect("generate_pki"); + let paths = LocalPaths::resolve(&dir); + + write_local_bundle(&dir, &bundle, &paths).expect("write_local_bundle"); + + for key in [&paths.ca_key, &paths.server_key, &paths.client_key] { + let mode = std::fs::metadata(key).unwrap().permissions().mode() & 0o777; + assert_eq!(mode, 0o600, "key {} has mode {:o}", key.display(), mode); + } + } + + #[test] + fn write_local_bundle_recovers_from_stale_temp_dir() { + let parent = tempfile::tempdir().expect("tempdir"); + let dir = parent.path().join("tls"); + let stale = sibling_temp_dir(&dir); + std::fs::create_dir_all(&stale).unwrap(); + std::fs::write(stale.join("garbage"), "stale").unwrap(); + + let bundle = generate_pki(&[]).expect("generate_pki"); + let paths = LocalPaths::resolve(&dir); + write_local_bundle(&dir, &bundle, &paths).expect("write_local_bundle"); + + assert!(paths.ca_crt.is_file()); + assert!(!stale.exists(), "stale temp dir should be removed"); + } +} diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index ccc08cf2b..b928be2e0 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -15,14 +15,34 @@ use std::path::PathBuf; use tracing::info; use tracing_subscriber::EnvFilter; +use crate::certgen; use crate::compute::{DockerComputeConfig, VmComputeConfig}; use crate::{run_server, tracing_bus::TracingLogBus}; /// `OpenShell` gateway process - gRPC and HTTP server with protocol multiplexing. +/// +/// Top-level CLI. 
When invoked without a subcommand the binary runs the +/// gateway server using `RunArgs`. The `generate-certs` subcommand is used by +/// the Helm pre-install hook to bootstrap mTLS Secrets. #[derive(Parser, Debug)] #[command(version = openshell_core::VERSION)] #[command(about = "OpenShell gRPC/HTTP server", long_about = None)] -struct Args { +struct Cli { + #[command(subcommand)] + command: Option, + + #[command(flatten)] + run: RunArgs, +} + +#[derive(clap::Subcommand, Debug)] +enum Commands { + /// Generate mTLS PKI and write Kubernetes Secrets (Helm pre-install hook). + GenerateCerts(certgen::CertgenArgs), +} + +#[derive(clap::Args, Debug)] +struct RunArgs { /// IP address to bind the server, health, and metrics listeners to. #[arg(long, default_value = "127.0.0.1", env = "OPENSHELL_BIND_ADDRESS")] bind_address: IpAddr, @@ -58,8 +78,12 @@ struct Args { tls_client_ca: Option, /// Database URL for persistence. - #[arg(long, env = "OPENSHELL_DB_URL", required = true)] - db_url: String, + /// + /// Required when running the gateway. Validated at the call site rather + /// than as a clap-level requirement so the `generate-certs` subcommand + /// (which does not need a database) can run without it. + #[arg(long, env = "OPENSHELL_DB_URL")] + db_url: Option, /// Compute drivers configured for this gateway. 
/// @@ -279,7 +303,7 @@ struct Args { } pub fn command() -> Command { - Args::command() + Cli::command() .name("openshell-gateway") .bin_name("openshell-gateway") } @@ -289,12 +313,15 @@ pub async fn run_cli() -> Result<()> { .install_default() .map_err(|e| miette::miette!("failed to install rustls crypto provider: {e:?}"))?; - let args = Args::from_arg_matches(&command().get_matches()).expect("clap validated args"); + let cli = Cli::from_arg_matches(&command().get_matches()).expect("clap validated args"); - Box::pin(run_from_args(args)).await + match cli.command { + Some(Commands::GenerateCerts(args)) => certgen::run(args).await, + None => Box::pin(run_from_args(cli.run)).await, + } } -async fn run_from_args(args: Args) -> Result<()> { +async fn run_from_args(args: RunArgs) -> Result<()> { let tracing_log_bus = TracingLogBus::new(); tracing_log_bus.install_subscriber( EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&args.log_level)), @@ -326,6 +353,10 @@ async fn run_from_args(args: Args) -> Result<()> { }) }; + let db_url = args + .db_url + .ok_or_else(|| miette::miette!("--db-url is required (or set OPENSHELL_DB_URL)"))?; + let mut config = openshell_core::Config::new(tls) .with_bind_address(bind) .with_log_level(&args.log_level); @@ -359,7 +390,7 @@ async fn run_from_args(args: Args) -> Result<()> { } config = config - .with_database_url(args.db_url) + .with_database_url(db_url) .with_compute_drivers(args.drivers) .with_sandbox_namespace(args.sandbox_namespace) .with_ssh_gateway_host(args.ssh_gateway_host) @@ -444,7 +475,7 @@ fn parse_compute_driver(value: &str) -> std::result::Result at the clap level so subcommand parsing + // does not require it. The Run path validates it inside + // run_from_args. This test asserts the parse step succeeds with no + // --db-url, mirroring what the runtime check sees. 
+ let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::remove("OPENSHELL_DB_URL"); + + let cli = Cli::try_parse_from(["openshell-gateway"]).expect("parses without --db-url"); + assert!(cli.command.is_none()); + assert!(cli.run.db_url.is_none()); } } diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index a80301c12..f63cf1915 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -20,6 +20,7 @@ //! [`compute::vm`]; keep this file driver-agnostic going forward. mod auth; +pub mod certgen; pub mod cli; mod compute; mod grpc; diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md index ee7565f29..cc856731d 100644 --- a/deploy/helm/openshell/README.md +++ b/deploy/helm/openshell/README.md @@ -52,3 +52,21 @@ See [`values.yaml`](values.yaml) for configurable values. Selected overlays: - [`ci/values-gateway.yaml`](ci/values-gateway.yaml) — gateway-only configuration - [`ci/values-cert-manager.yaml`](ci/values-cert-manager.yaml) — cert-manager integration - [`ci/values-keycloak.yaml`](ci/values-keycloak.yaml) — Keycloak OIDC integration + +## PKI bootstrap + +By default, a pre-install/pre-upgrade hook Job runs `openshell-gateway generate-certs` +to create the gateway's server and client mTLS Secrets. The Job uses the gateway image +itself, so air-gapped environments only need to mirror that one image (no separate +openssl/alpine sidecar). + +The Job is idempotent: + +- Both target Secrets exist → log and exit 0. +- Exactly one exists → fail with a `kubectl delete secret -n <namespace> <server-secret> <client-secret>` recovery hint. +- Neither exists → generate a CA, server cert, and client cert; POST both `kubernetes.io/tls` + Secrets (`tls.crt`, `tls.key`, `ca.crt`). + +Disable with `--set pkiInitJob.enabled=false` when bringing your own PKI (cert-manager, +external CA, or pre-created Secrets). See `certManager.*` in `values.yaml` for the +cert-manager alternative. 
diff --git a/deploy/helm/openshell/templates/certgen.yaml b/deploy/helm/openshell/templates/certgen.yaml new file mode 100644 index 000000000..d8136d581 --- /dev/null +++ b/deploy/helm/openshell/templates/certgen.yaml @@ -0,0 +1,109 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +{{- if and .Values.pkiInitJob.enabled .Values.certManager.enabled }} +{{- fail "pkiInitJob.enabled and certManager.enabled cannot both be true; disable one to avoid conflicting PKI sources." }} +{{- end }} +{{- if .Values.pkiInitJob.enabled }} +{{- $hookName := printf "%s-certgen" (include "openshell.fullname" .) }} +{{- $ns := .Release.Namespace }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ $hookName }} + namespace: {{ $ns }} + labels: + {{- include "openshell.labels" . | nindent 4 }} + annotations: + helm.sh/hook: pre-install,pre-upgrade + helm.sh/hook-weight: "-30" + helm.sh/hook-delete-policy: before-hook-creation +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ $hookName }} + namespace: {{ $ns }} + labels: + {{- include "openshell.labels" . | nindent 4 }} + annotations: + helm.sh/hook: pre-install,pre-upgrade + helm.sh/hook-weight: "-30" + helm.sh/hook-delete-policy: before-hook-creation +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ $hookName }} + namespace: {{ $ns }} + labels: + {{- include "openshell.labels" . 
| nindent 4 }} + annotations: + helm.sh/hook: pre-install,pre-upgrade + helm.sh/hook-weight: "-30" + helm.sh/hook-delete-policy: before-hook-creation +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ $hookName }} +subjects: + - kind: ServiceAccount + name: {{ $hookName }} + namespace: {{ $ns }} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ $hookName }} + namespace: {{ $ns }} + labels: + {{- include "openshell.labels" . | nindent 4 }} + annotations: + helm.sh/hook: pre-install,pre-upgrade + helm.sh/hook-weight: "-20" + helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded +spec: + backoffLimit: 3 + activeDeadlineSeconds: 120 + ttlSecondsAfterFinished: 300 + template: + metadata: + labels: + {{- include "openshell.selectorLabels" . | nindent 8 }} + spec: + restartPolicy: OnFailure + serviceAccountName: {{ $hookName }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: certgen + image: {{ include "openshell.image" . | quote }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + command: ["openshell-gateway"] + args: + - generate-certs + - --server-secret-name={{ .Values.server.tls.certSecretName }} + - --client-secret-name={{ .Values.server.tls.clientTlsSecretName }} + {{- range .Values.pkiInitJob.serverDnsNames }} + - --server-san={{ . }} + {{- end }} + {{- range .Values.pkiInitJob.serverIpAddresses }} + - --server-san={{ . }} + {{- end }} +{{- end }} diff --git a/deploy/helm/openshell/templates/pki-hook.yaml b/deploy/helm/openshell/templates/pki-hook.yaml deleted file mode 100644 index c5e83c734..000000000 --- a/deploy/helm/openshell/templates/pki-hook.yaml +++ /dev/null @@ -1,191 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -{{- if and .Values.pkiInitJob.enabled .Values.certManager.enabled }} -{{- fail "pkiInitJob.enabled and certManager.enabled cannot both be true; disable one to avoid conflicting PKI sources." }} -{{- end }} -{{- if .Values.pkiInitJob.enabled }} -{{- $hookName := printf "%s-pki-hook" (include "openshell.fullname" .) }} -{{- $ns := .Release.Namespace }} -{{- $serverSecret := .Values.server.tls.certSecretName }} -{{- $clientSecret := .Values.server.tls.clientTlsSecretName }} -{{- $sanParts := list }} -{{- range .Values.pkiInitJob.serverDnsNames }}{{- $sanParts = append $sanParts (printf "DNS:%s" .) }}{{- end }} -{{- range .Values.pkiInitJob.serverIpAddresses }}{{- $sanParts = append $sanParts (printf "IP:%s" .) }}{{- end }} -{{- $serverSans := join "," $sanParts }} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ $hookName }} - namespace: {{ $ns }} - labels: - {{- include "openshell.labels" . | nindent 4 }} - annotations: - helm.sh/hook: pre-install,pre-upgrade - helm.sh/hook-weight: "-30" - helm.sh/hook-delete-policy: before-hook-creation ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: {{ $hookName }} - namespace: {{ $ns }} - labels: - {{- include "openshell.labels" . | nindent 4 }} - annotations: - helm.sh/hook: pre-install,pre-upgrade - helm.sh/hook-weight: "-30" - helm.sh/hook-delete-policy: before-hook-creation -rules: - - apiGroups: [""] - resources: ["secrets"] - verbs: ["get", "create"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: {{ $hookName }} - namespace: {{ $ns }} - labels: - {{- include "openshell.labels" . 
| nindent 4 }} - annotations: - helm.sh/hook: pre-install,pre-upgrade - helm.sh/hook-weight: "-30" - helm.sh/hook-delete-policy: before-hook-creation -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: {{ $hookName }} -subjects: - - kind: ServiceAccount - name: {{ $hookName }} - namespace: {{ $ns }} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: {{ $hookName }} - namespace: {{ $ns }} - labels: - {{- include "openshell.labels" . | nindent 4 }} - annotations: - helm.sh/hook: pre-install,pre-upgrade - helm.sh/hook-weight: "-20" - helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded -spec: - backoffLimit: 3 - activeDeadlineSeconds: 120 - ttlSecondsAfterFinished: 300 - template: - metadata: - labels: - {{- include "openshell.selectorLabels" . | nindent 8 }} - spec: - restartPolicy: OnFailure - serviceAccountName: {{ $hookName }} - containers: - - name: pki-gen - image: {{ .Values.pkiInitJob.image.repository }}:{{ .Values.pkiInitJob.image.tag }} - imagePullPolicy: {{ .Values.pkiInitJob.image.pullPolicy }} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: SERVER_SECRET - value: {{ $serverSecret | quote }} - - name: CLIENT_SECRET - value: {{ $clientSecret | quote }} - - name: CA_DAYS - value: {{ .Values.pkiInitJob.caValidityDays | quote }} - - name: CERT_DAYS - value: {{ .Values.pkiInitJob.certValidityDays | quote }} - - name: SERVER_SANS - value: {{ $serverSans | quote }} - command: - - /bin/sh - - -c - - | - set -eu - apk add --no-cache openssl curl >/dev/null 2>&1 - - TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token) - K8S_CA=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt - API=https://kubernetes.default.svc - - # Idempotency: skip only when both TLS secrets already exist. 
- # Checking one is insufficient — a partial cleanup can leave one half - # of the pair behind, which would cause mTLS to fail at runtime. - HTTP_SERVER=$(curl -s -o /dev/null -w "%{http_code}" \ - -H "Authorization: Bearer $TOKEN" --cacert "$K8S_CA" \ - "$API/api/v1/namespaces/$NAMESPACE/secrets/$SERVER_SECRET") - HTTP_CLIENT=$(curl -s -o /dev/null -w "%{http_code}" \ - -H "Authorization: Bearer $TOKEN" --cacert "$K8S_CA" \ - "$API/api/v1/namespaces/$NAMESPACE/secrets/$CLIENT_SECRET") - if [ "$HTTP_SERVER" = "200" ] && [ "$HTTP_CLIENT" = "200" ]; then - echo "PKI secrets already exist, skipping." - exit 0 - fi - if [ "$HTTP_SERVER" = "200" ] || [ "$HTTP_CLIENT" = "200" ]; then - echo "ERROR: partial PKI state — one secret exists but not both." >&2 - echo "To recover: kubectl delete secret -n $NAMESPACE $SERVER_SECRET $CLIENT_SECRET" >&2 - exit 1 - fi - - cd /tmp - - # CA (ECDSA P-256) - openssl genpkey -algorithm EC -pkeyopt ec_paramgen_curve:P-256 -out ca.key 2>/dev/null - openssl req -new -x509 -sha256 -key ca.key -out ca.crt \ - -days "$CA_DAYS" -subj "/O=openshell/CN=openshell-ca" \ - -addext "basicConstraints=critical,CA:TRUE,pathlen:0" \ - -addext "keyUsage=critical,keyCertSign,cRLSign" - - # Server cert (ECDSA P-256) - printf "[ext]\nsubjectAltName=%s\nextendedKeyUsage=serverAuth\nkeyUsage=digitalSignature,keyEncipherment\n" \ - "$SERVER_SANS" > server.ext - openssl genpkey -algorithm EC -pkeyopt ec_paramgen_curve:P-256 -out server.key 2>/dev/null - openssl req -new -sha256 -key server.key -out server.csr -subj "/CN=openshell-server" - openssl x509 -req -sha256 -in server.csr -CA ca.crt -CAkey ca.key \ - -CAcreateserial -days "$CERT_DAYS" -extensions ext -extfile server.ext -out server.crt - - # Client cert (ECDSA P-256) - printf "[ext]\nextendedKeyUsage=clientAuth\nkeyUsage=digitalSignature,keyEncipherment\n" \ - > client.ext - openssl genpkey -algorithm EC -pkeyopt ec_paramgen_curve:P-256 -out client.key 2>/dev/null - openssl req -new -sha256 -key 
client.key -out client.csr -subj "/CN=openshell-client" - openssl x509 -req -sha256 -in client.csr -CA ca.crt -CAkey ca.key \ - -CAcreateserial -days "$CERT_DAYS" -extensions ext -extfile client.ext -out client.crt - - CA_B64=$(base64 -w0 ca.crt) - SERVER_CRT_B64=$(base64 -w0 server.crt) - SERVER_KEY_B64=$(base64 -w0 server.key) - CLIENT_CRT_B64=$(base64 -w0 client.crt) - CLIENT_KEY_B64=$(base64 -w0 client.key) - - # Create server TLS secret - printf '{"apiVersion":"v1","kind":"Secret","metadata":{"name":"%s","namespace":"%s"},"type":"kubernetes.io/tls","data":{"tls.crt":"%s","tls.key":"%s","ca.crt":"%s"}}\n' \ - "$SERVER_SECRET" "$NAMESPACE" \ - "$SERVER_CRT_B64" "$SERVER_KEY_B64" "$CA_B64" > server-secret.json - curl -sf -X POST \ - -H "Authorization: Bearer $TOKEN" -H "Content-Type: application/json" \ - --cacert "$K8S_CA" "$API/api/v1/namespaces/$NAMESPACE/secrets" \ - -d @server-secret.json - - # Create client TLS secret - printf '{"apiVersion":"v1","kind":"Secret","metadata":{"name":"%s","namespace":"%s"},"type":"kubernetes.io/tls","data":{"tls.crt":"%s","tls.key":"%s","ca.crt":"%s"}}\n' \ - "$CLIENT_SECRET" "$NAMESPACE" \ - "$CLIENT_CRT_B64" "$CLIENT_KEY_B64" "$CA_B64" > client-secret.json - curl -sf -X POST \ - -H "Authorization: Bearer $TOKEN" -H "Content-Type: application/json" \ - --cacert "$K8S_CA" "$API/api/v1/namespaces/$NAMESPACE/secrets" \ - -d @client-secret.json - - rm -f *.key *.csr *.crt *.ext *.srl *.json - echo "PKI secrets created." -{{- end }} diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index f8e090721..2b1051d06 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -154,32 +154,23 @@ sshHandshake: value: "" # PKI bootstrap via a pre-install/pre-upgrade hook Job. -# Generates a self-signed CA, server TLS secret, and client TLS secret using -# openssl (ECDSA P-256) inside the cluster. 
Key material is written directly to -# K8s Secrets and never appears in Helm release history. Idempotent: existing -# secrets are left untouched on upgrade. -# Air-gapped environments should override pkiInitJob.image with an image that has -# openssl and curl pre-installed (the default alpine image fetches them at runtime). +# Runs `openshell-gateway generate-certs` to create the server and client TLS +# Secrets in-cluster. Key material is written directly to K8s Secrets and +# never appears in Helm release history. Idempotent: existing secrets are +# left untouched on upgrade; if only one of the two Secrets exists, the hook fails with a recovery hint. Reuses the gateway image — no extra image to +# mirror in air-gapped environments. +# +# The server certificate already includes the built-in cluster SANs +# (`openshell`, `openshell.openshell.svc`, the cluster.local FQDN, `localhost`, +# `host.docker.internal`, and `127.0.0.1`) baked into the gateway binary. The +# lists below are *additional* SANs appended on top — typically a public +# hostname or load-balancer IP for remote deployments. pkiInitJob: enabled: true - image: - repository: alpine - tag: "3" - pullPolicy: IfNotPresent - # Days until the CA certificate expires. - caValidityDays: 3650 - # Days until server and client certificates expire. - certValidityDays: 3650 - # DNS SANs for the server certificate. - serverDnsNames: - - openshell - - openshell.openshell.svc - - openshell.openshell.svc.cluster.local - - localhost - - host.docker.internal - # IP SANs for the server certificate. - serverIpAddresses: - - 127.0.0.1 + # Extra DNS SANs to append to the server certificate. + serverDnsNames: [] + # Extra IP SANs to append to the server certificate. + serverIpAddresses: [] # cert-manager Certificate/Issuer resources (requires cert-manager CRDs in-cluster). # Uses namespaced Issuers only (no ClusterIssuer). Does not install cert-manager itself.