diff --git a/.agents/skills/helm-dev-environment/SKILL.md b/.agents/skills/helm-dev-environment/SKILL.md
index 18d8c241e..623efb2e6 100644
--- a/.agents/skills/helm-dev-environment/SKILL.md
+++ b/.agents/skills/helm-dev-environment/SKILL.md
@@ -57,7 +57,8 @@ mise run helm:skaffold:run
```
Both commands build the `gateway` and `supervisor` images and deploy the OpenShell Helm
-chart. The `pkiInitJob` hook runs on first install to generate mTLS secrets. Envoy Gateway opt-in; see the Optional Add-ons section below.
+chart. The `pkiInitJob` hook (a pre-install Job that runs `openshell-gateway generate-certs`)
+generates mTLS secrets on first install. Envoy Gateway is opt-in; see the Optional Add-ons section below.
The gateway Service uses ClusterIP. Access is via Envoy Gateway (port `8080`) or `kubectl port-forward`.
diff --git a/Cargo.lock b/Cargo.lock
index 28d86ea93..808956cd9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3630,10 +3630,13 @@ dependencies = [
"hyper-util",
"ipnet",
"jsonwebtoken 9.3.1",
+ "k8s-openapi",
+ "kube",
"metrics",
"metrics-exporter-prometheus",
"miette",
"nix",
+ "openshell-bootstrap",
"openshell-core",
"openshell-driver-docker",
"openshell-driver-kubernetes",
diff --git a/architecture/gateway.md b/architecture/gateway.md
index f36878cf1..d89706e64 100644
--- a/architecture/gateway.md
+++ b/architecture/gateway.md
@@ -132,6 +132,32 @@ The same relay pattern backs interactive SSH, command execution, and file sync.
The gateway tracks live sessions in memory and persists session records so
tokens can expire or be revoked.
+## PKI Bootstrap
+
+`openshell-gateway generate-certs` is the one place mTLS materials are
+created. Both deployment paths use it:
+
+| Output mode | Selector | Layout |
+|---|---|---|
+| Kubernetes Secrets | (default) `--namespace`, `--server-secret-name`, `--client-secret-name` | Two `kubernetes.io/tls` Secrets with `tls.crt` / `tls.key` / `ca.crt`. |
+| Filesystem | `--output-dir <DIR>` | `<DIR>/{ca.crt, ca.key, server/tls.{crt,key}, client/tls.{crt,key}}`. Also copies client materials to `$XDG_CONFIG_HOME/openshell/gateways/openshell/mtls/` for CLI auto-discovery. |
+
+On Kubernetes, the Helm chart runs the command via a pre-install/pre-upgrade
+hook Job using the gateway image itself — no separate cert-generation image,
+no extra mirror burden in air-gapped environments. On the RPM gateway, the
+same command runs from the systemd unit's `ExecStartPre` to bootstrap PKI
+into the user's state directory on first start.
+
+Both modes share the same idempotency contract: all targets present → skip;
+partial state → fail with a recovery hint; nothing present → generate and
+write. This guards mTLS continuity across restarts and upgrades while still
+recovering cleanly if an operator deletes everything and starts over.
+
+Operators who manage PKI externally (cert-manager, an enterprise CA, or
+pre-created Secrets) disable the Helm hook via `pkiInitJob.enabled=false`.
+The chart also ships a `certManager.*` path that produces equivalent Secrets
+through cert-manager `Issuer`/`Certificate` resources.
+
## Operational Constraints
- Gateway TLS and client certificate distribution are deployment concerns owned
diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml
index fab20186c..9cba99045 100644
--- a/crates/openshell-server/Cargo.toml
+++ b/crates/openshell-server/Cargo.toml
@@ -15,6 +15,7 @@ name = "openshell-gateway"
path = "src/main.rs"
[dependencies]
+openshell-bootstrap = { path = "../openshell-bootstrap" }
openshell-core = { path = "../openshell-core" }
openshell-driver-docker = { path = "../openshell-driver-docker" }
openshell-driver-kubernetes = { path = "../openshell-driver-kubernetes" }
@@ -24,6 +25,10 @@ openshell-policy = { path = "../openshell-policy" }
openshell-providers = { path = "../openshell-providers" }
openshell-router = { path = "../openshell-router" }
+# Kubernetes client (used by the `generate-certs` subcommand)
+kube = { workspace = true }
+k8s-openapi = { workspace = true }
+
# Async runtime
tokio = { workspace = true }
diff --git a/crates/openshell-server/src/certgen.rs b/crates/openshell-server/src/certgen.rs
new file mode 100644
index 000000000..b9e4d7bd5
--- /dev/null
+++ b/crates/openshell-server/src/certgen.rs
@@ -0,0 +1,525 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! `generate-certs` subcommand: bootstrap mTLS PKI for the gateway.
+//!
+//! Two output modes, dispatched by the presence of `--output-dir`:
+//!
+//! - **Kubernetes mode** (default): create two `kubernetes.io/tls` Secrets
+//! in the supplied namespace. Used by the Helm pre-install hook. Requires
+//! `--namespace`, `--server-secret-name`, `--client-secret-name`.
+//! - **Local mode** (`--output-dir <DIR>`): write PEMs to a filesystem layout
+//! matching `deploy/rpm/init-pki.sh`. Used by the RPM systemd unit's
+//! `ExecStartPre`. Also copies client materials to
+//! `$XDG_CONFIG_HOME/openshell/gateways/openshell/mtls/` so the local CLI
+//! picks them up automatically.
+//!
+//! Both modes share the same idempotency contract: all targets present →
+//! skip; partial state → error with a recovery hint; nothing present →
+//! generate and write.
+
+use clap::Args;
+use k8s_openapi::ByteString;
+use k8s_openapi::api::core::v1::Secret;
+use kube::Client;
+use kube::api::{Api, ObjectMeta, PostParams};
+use miette::{IntoDiagnostic, Result, WrapErr};
+use openshell_bootstrap::pki::{PkiBundle, generate_pki};
+use openshell_core::paths::{create_dir_restricted, set_file_owner_only};
+use std::collections::BTreeMap;
+use std::path::{Path, PathBuf};
+use tracing::{info, warn};
+use tracing_subscriber::EnvFilter;
+
+// Arguments for `generate-certs`. `--output-dir` selects local (filesystem)
+// mode; without it the Kubernetes flags below are required. `--dry-run` only
+// prints the generated bundle, so it does not need a destination either.
+// Kept as a plain comment so it cannot end up in the clap help output.
+#[derive(Args, Debug)]
+pub struct CertgenArgs {
+    /// Write PEMs to a filesystem directory instead of Kubernetes Secrets.
+    /// When set, the kube-related flags are not required.
+    #[arg(long, value_name = "DIR")]
+    output_dir: Option<PathBuf>,
+
+    /// Kubernetes namespace to create Secrets in.
+    /// Default comes from `POD_NAMESPACE`, which the Helm hook injects via
+    /// the downward API.
+    #[arg(
+        long,
+        env = "POD_NAMESPACE",
+        required_unless_present_any = ["output_dir", "dry_run"]
+    )]
+    namespace: Option<String>,
+
+    /// Name of the server TLS Secret (`kubernetes.io/tls`) to create.
+    #[arg(long, required_unless_present_any = ["output_dir", "dry_run"])]
+    server_secret_name: Option<String>,
+
+    /// Name of the client TLS Secret (`kubernetes.io/tls`) to create.
+    #[arg(long, required_unless_present_any = ["output_dir", "dry_run"])]
+    client_secret_name: Option<String>,
+
+    /// Extra Subject Alternative Name for the server certificate. Repeatable.
+    /// Auto-detected as an IP address or DNS name.
+    #[arg(long = "server-san", value_name = "SAN")]
+    server_sans: Vec<String>,
+
+    /// Print the generated PEM materials to stdout instead of writing them.
+    /// For local debugging.
+    #[arg(long)]
+    dry_run: bool,
+}
+
+/// Entry point for the `generate-certs` subcommand.
+///
+/// Installs a `tracing_subscriber::fmt` subscriber (filter taken from the
+/// environment, falling back to `info`), generates a fresh PKI bundle from
+/// the requested server SANs, and then either prints it (`--dry-run`), writes
+/// it under `--output-dir`, or creates Kubernetes Secrets.
+pub async fn run(args: CertgenArgs) -> Result<()> {
+    let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"));
+    tracing_subscriber::fmt().with_env_filter(filter).init();
+
+    let bundle = generate_pki(&args.server_sans)?;
+    if args.dry_run {
+        print_bundle(&bundle);
+        return Ok(());
+    }
+
+    // The presence of `--output-dir` is what selects local mode.
+    match args.output_dir.as_deref() {
+        Some(dir) => run_local(dir, &bundle),
+        None => run_kubernetes(&args, &bundle).await,
+    }
+}
+
+// ─────────────────────────── Kubernetes mode ───────────────────────────
+
+/// What Kubernetes mode should do, given which target Secrets already exist.
+#[derive(Debug, PartialEq, Eq)]
+enum K8sAction {
+    /// Both Secrets are present; leave them untouched.
+    SkipExists,
+    /// Exactly one Secret is present; refuse to continue.
+    PartialState,
+    /// Neither Secret is present; create both.
+    Create,
+}
+
+/// Maps the existence of the server and client Secrets to a [`K8sAction`].
+fn decide_k8s(server_exists: bool, client_exists: bool) -> K8sAction {
+    if server_exists != client_exists {
+        K8sAction::PartialState
+    } else if server_exists {
+        K8sAction::SkipExists
+    } else {
+        K8sAction::Create
+    }
+}
+
+/// Creates the server and client `kubernetes.io/tls` Secrets from `bundle`.
+///
+/// Both target Secrets are looked up first and the shared idempotency
+/// contract is applied: both present → log and return; exactly one present →
+/// error with a recovery hint; neither present → create both.
+///
+/// # Errors
+///
+/// Fails when a required flag is missing, the Kubernetes client cannot be
+/// constructed, a Secret cannot be read or created, or partial state is found.
+async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> {
+    let namespace = args
+        .namespace
+        .as_deref()
+        .ok_or_else(|| miette::miette!("--namespace is required (or set POD_NAMESPACE)"))?;
+    let server_name = args
+        .server_secret_name
+        .as_deref()
+        .ok_or_else(|| miette::miette!("--server-secret-name is required"))?;
+    let client_name = args
+        .client_secret_name
+        .as_deref()
+        .ok_or_else(|| miette::miette!("--client-secret-name is required"))?;
+
+    let client = Client::try_default()
+        .await
+        .into_diagnostic()
+        .wrap_err("failed to construct in-cluster Kubernetes client")?;
+    let api: Api<Secret> = Api::namespaced(client, namespace);
+
+    let server_exists = api
+        .get_opt(server_name)
+        .await
+        .into_diagnostic()
+        .wrap_err_with(|| format!("failed to read secret {server_name}"))?
+        .is_some();
+    let client_exists = api
+        .get_opt(client_name)
+        .await
+        .into_diagnostic()
+        .wrap_err_with(|| format!("failed to read secret {client_name}"))?
+        .is_some();
+
+    match decide_k8s(server_exists, client_exists) {
+        K8sAction::SkipExists => {
+            info!(
+                namespace = %namespace,
+                server = %server_name,
+                client = %client_name,
+                "PKI secrets already exist, skipping."
+            );
+            return Ok(());
+        }
+        K8sAction::PartialState => {
+            // Only one of the two Secrets exists, so the suggested command
+            // must tolerate the missing one instead of exiting non-zero.
+            return Err(miette::miette!(
+                "partial PKI state in namespace {namespace}: exactly one of \
+                 {server_name} / {client_name} exists. Recover with: \
+                 kubectl delete secret -n {namespace} --ignore-not-found \
+                 {server_name} {client_name}",
+            ));
+        }
+        K8sAction::Create => {}
+    }
+
+    let server_secret = tls_secret(
+        server_name,
+        &bundle.server_cert_pem,
+        &bundle.server_key_pem,
+        &bundle.ca_cert_pem,
+    );
+    let client_secret = tls_secret(
+        client_name,
+        &bundle.client_cert_pem,
+        &bundle.client_key_pem,
+        &bundle.ca_cert_pem,
+    );
+
+    api.create(&PostParams::default(), &server_secret)
+        .await
+        .into_diagnostic()
+        .wrap_err_with(|| format!("failed to create secret {server_name}"))?;
+
+    // The server Secret exists from here on. If the second create fails, any
+    // retry observes partial state and refuses to continue, so the error has
+    // to say how to get back to a clean slate.
+    api.create(&PostParams::default(), &client_secret)
+        .await
+        .into_diagnostic()
+        .wrap_err_with(|| {
+            format!(
+                "failed to create secret {client_name}; {server_name} was already \
+                 created, so remove it before retrying: \
+                 kubectl delete secret -n {namespace} --ignore-not-found {server_name}"
+            )
+        })?;
+
+    info!(
+        namespace = %namespace,
+        server = %server_name,
+        client = %client_name,
+        "PKI secrets created."
+    );
+    Ok(())
+}
+
+/// Builds a `kubernetes.io/tls` Secret named `name` whose data holds
+/// `tls.crt`, `tls.key`, and `ca.crt` with the given PEM contents.
+fn tls_secret(name: &str, crt_pem: &str, key_pem: &str, ca_pem: &str) -> Secret {
+    let entries = [("tls.crt", crt_pem), ("tls.key", key_pem), ("ca.crt", ca_pem)];
+    let data: BTreeMap<String, ByteString> = entries
+        .into_iter()
+        .map(|(key, pem)| (key.to_string(), ByteString(pem.as_bytes().to_vec())))
+        .collect();
+
+    let metadata = ObjectMeta {
+        name: Some(name.to_string()),
+        ..Default::default()
+    };
+    Secret {
+        metadata,
+        type_: Some("kubernetes.io/tls".to_string()),
+        data: Some(data),
+        ..Default::default()
+    }
+}
+
+// ─────────────────────────────── Local mode ───────────────────────────────
+
+/// What local mode should do, based on how many of the six PKI files exist.
+#[derive(Debug, PartialEq, Eq)]
+enum LocalAction {
+    /// All six files are present; nothing is written.
+    Skip,
+    /// Some, but not all, files are present; the run fails with a recovery hint.
+    PartialState,
+    /// No file is present; the bundle is written.
+    Create,
+}
+
+/// Resolved locations of the six PKI files below the output directory.
+///
+/// The layout under `<dir>` matches `deploy/rpm/init-pki.sh`:
+///
+/// ```text
+/// <dir>/ca.crt
+/// <dir>/ca.key
+/// <dir>/server/tls.crt
+/// <dir>/server/tls.key
+/// <dir>/client/tls.crt
+/// <dir>/client/tls.key
+/// ```
+struct LocalPaths {
+    ca_crt: PathBuf,
+    ca_key: PathBuf,
+    server_dir: PathBuf,
+    server_crt: PathBuf,
+    server_key: PathBuf,
+    client_dir: PathBuf,
+    client_crt: PathBuf,
+    client_key: PathBuf,
+}
+
+impl LocalPaths {
+    /// Computes every path of the layout rooted at `dir`; touches no files.
+    fn resolve(dir: &Path) -> Self {
+        let (server_dir, client_dir) = (dir.join("server"), dir.join("client"));
+        let (server_crt, server_key) = (server_dir.join("tls.crt"), server_dir.join("tls.key"));
+        let (client_crt, client_key) = (client_dir.join("tls.crt"), client_dir.join("tls.key"));
+        Self {
+            ca_crt: dir.join("ca.crt"),
+            ca_key: dir.join("ca.key"),
+            server_dir,
+            server_crt,
+            server_key,
+            client_dir,
+            client_crt,
+            client_key,
+        }
+    }
+
+    /// The six files in the order CA, server, client — certificate before key.
+    fn all_files(&self) -> [&Path; 6] {
+        [
+            self.ca_crt.as_path(),
+            self.ca_key.as_path(),
+            self.server_crt.as_path(),
+            self.server_key.as_path(),
+            self.client_crt.as_path(),
+            self.client_key.as_path(),
+        ]
+    }
+
+    /// Number of layout files for which `Path::exists` reports `true`.
+    fn existence_count(&self) -> usize {
+        let mut present = 0;
+        for file in self.all_files() {
+            if file.exists() {
+                present += 1;
+            }
+        }
+        present
+    }
+}
+
+/// Maps the number of existing PKI files (out of six) to a [`LocalAction`].
+fn decide_local(present: usize) -> LocalAction {
+    if present == 0 {
+        LocalAction::Create
+    } else if present == 6 {
+        LocalAction::Skip
+    } else {
+        LocalAction::PartialState
+    }
+}
+
+/// Runs local (filesystem) mode against `dir`.
+///
+/// Applies the idempotency contract to the six files under `dir`, then makes
+/// a best-effort copy of the PKI materials for CLI auto-discovery.
+///
+/// `bundle` is the freshly generated PKI. It only describes what is on disk
+/// when this call is the one that writes it. When the files already exist,
+/// the copy handed to the CLI is re-read from disk, so the CLI never receives
+/// certificates from a CA the running gateway does not use.
+///
+/// # Errors
+///
+/// Fails on partial state or when the new bundle cannot be written. Problems
+/// with the CLI copy are logged as warnings and do not fail the run.
+fn run_local(dir: &Path, bundle: &PkiBundle) -> Result<()> {
+    let paths = LocalPaths::resolve(dir);
+
+    // `None`: the files were just written from `bundle`.
+    // `Some(..)`: outcome of re-reading a layout that already existed.
+    let reloaded = match decide_local(paths.existence_count()) {
+        LocalAction::Skip => {
+            info!(dir = %dir.display(), "PKI files already exist, skipping.");
+            Some(read_local_bundle(&paths))
+        }
+        LocalAction::PartialState => {
+            return Err(miette::miette!(
+                "partial PKI state in {dir}: some files exist but not all. \
+                 Recover with: rm -rf {dir} (the gateway will regenerate on next start)",
+                dir = dir.display(),
+            ));
+        }
+        LocalAction::Create => {
+            write_local_bundle(dir, bundle, &paths)?;
+            info!(dir = %dir.display(), "PKI files created.");
+            None
+        }
+    };
+
+    let cli_bundle = match &reloaded {
+        None => bundle,
+        Some(Ok(existing)) => existing,
+        Some(Err(e)) => {
+            // The CLI copy is best-effort; an unreadable layout must not turn
+            // a previously successful "skip" into a failed start.
+            warn!(
+                error = %e,
+                "failed to read existing PKI files; skipping CLI auto-discovery copy"
+            );
+            return Ok(());
+        }
+    };
+
+    // Always make sure the CLI auto-discovery copy is in place. This
+    // self-heals the case where the operator wiped ~/.config/openshell but
+    // left the gateway state directory intact.
+    if let Err(e) = openshell_bootstrap::mtls::store_pki_bundle("openshell", cli_bundle) {
+        warn!(error = %e, "failed to copy client mTLS materials for CLI auto-discovery");
+    }
+
+    Ok(())
+}
+
+/// Loads the six PEM files of an existing layout back into a [`PkiBundle`].
+// NOTE(review): assumes `PkiBundle` consists of exactly these six public
+// `String` fields — confirm against `openshell_bootstrap::pki`.
+fn read_local_bundle(paths: &LocalPaths) -> Result<PkiBundle> {
+    Ok(PkiBundle {
+        ca_cert_pem: read_pem(&paths.ca_crt)?,
+        ca_key_pem: read_pem(&paths.ca_key)?,
+        server_cert_pem: read_pem(&paths.server_crt)?,
+        server_key_pem: read_pem(&paths.server_key)?,
+        client_cert_pem: read_pem(&paths.client_crt)?,
+        client_key_pem: read_pem(&paths.client_key)?,
+    })
+}
+
+/// Reads one PEM file as UTF-8 text, naming the path on failure.
+fn read_pem(path: &Path) -> Result<String> {
+    std::fs::read_to_string(path)
+        .into_diagnostic()
+        .wrap_err_with(|| format!("failed to read {}", path.display()))
+}
+
+/// Writes the six PEM files of `bundle` into the layout described by `paths`.
+///
+/// Everything is first written into a staging directory next to `dir`, and
+/// the files are then renamed one by one into their final location. Private
+/// keys are passed to `write_pem` with `owner_only` set.
+fn write_local_bundle(dir: &Path, bundle: &PkiBundle, paths: &LocalPaths) -> Result<()> {
+    // Staging lives in a sibling directory so the individual renames into the
+    // final layout stay on the same filesystem.
+    let staging = sibling_temp_dir(dir);
+    if staging.exists() {
+        std::fs::remove_dir_all(&staging)
+            .into_diagnostic()
+            .wrap_err_with(|| format!("failed to remove stale {}", staging.display()))?;
+    }
+
+    let staging_server = staging.join("server");
+    let staging_client = staging.join("client");
+    for created in [&staging, &staging_server, &staging_client] {
+        create_dir_restricted(created)?;
+    }
+
+    // (staged path, final path, PEM contents, is a private key)
+    let plan: [(PathBuf, &Path, &str, bool); 6] = [
+        (
+            staging.join("ca.crt"),
+            paths.ca_crt.as_path(),
+            bundle.ca_cert_pem.as_str(),
+            false,
+        ),
+        (
+            staging.join("ca.key"),
+            paths.ca_key.as_path(),
+            bundle.ca_key_pem.as_str(),
+            true,
+        ),
+        (
+            staging_server.join("tls.crt"),
+            paths.server_crt.as_path(),
+            bundle.server_cert_pem.as_str(),
+            false,
+        ),
+        (
+            staging_server.join("tls.key"),
+            paths.server_key.as_path(),
+            bundle.server_key_pem.as_str(),
+            true,
+        ),
+        (
+            staging_client.join("tls.crt"),
+            paths.client_crt.as_path(),
+            bundle.client_cert_pem.as_str(),
+            false,
+        ),
+        (
+            staging_client.join("tls.key"),
+            paths.client_key.as_path(),
+            bundle.client_key_pem.as_str(),
+            true,
+        ),
+    ];
+
+    for (staged, _, pem, private) in &plan {
+        write_pem(staged, pem, *private)?;
+    }
+
+    // Final destination (might not exist yet on first run).
+    for created in [dir, paths.server_dir.as_path(), paths.client_dir.as_path()] {
+        create_dir_restricted(created)?;
+    }
+
+    for (staged, dest, ..) in &plan {
+        std::fs::rename(staged, dest)
+            .into_diagnostic()
+            .wrap_err_with(|| {
+                format!("failed to move {} -> {}", staged.display(), dest.display())
+            })?;
+    }
+
+    // Best-effort cleanup: a leftover staging directory is removed at the
+    // start of the next call anyway.
+    let _ = std::fs::remove_dir_all(&staging);
+    Ok(())
+}
+
+/// Writes `contents` to `path`; when `owner_only` is set, the file is then
+/// passed to `set_file_owner_only`.
+fn write_pem(path: &Path, contents: &str, owner_only: bool) -> Result<()> {
+    let written = std::fs::write(path, contents).into_diagnostic();
+    written.wrap_err_with(|| format!("failed to write {}", path.display()))?;
+
+    if owner_only {
+        set_file_owner_only(path)?;
+    }
+    Ok(())
+}
+
+/// Returns the staging directory for `dir`: the same parent, with
+/// `.certgen.tmp` appended to the final path component.
+fn sibling_temp_dir(dir: &Path) -> PathBuf {
+    // A sibling keeps staging and destination on the same filesystem, which
+    // is what lets `std::fs::rename` succeed.
+    let mut staged_name = std::ffi::OsString::new();
+    if let Some(last) = dir.file_name() {
+        staged_name.push(last);
+    }
+    staged_name.push(".certgen.tmp");
+    dir.with_file_name(staged_name)
+}
+
+// ────────────────────────────── Shared utility ─────────────────────────────
+
+/// Prints the CA certificate and the server and client certificates and keys
+/// to stdout, each under a `#` heading. The CA private key is not printed.
+fn print_bundle(bundle: &PkiBundle) {
+    let sections = [
+        ("# CA certificate", &bundle.ca_cert_pem),
+        ("# Server certificate", &bundle.server_cert_pem),
+        ("# Server key", &bundle.server_key_pem),
+        ("# Client certificate", &bundle.client_cert_pem),
+        ("# Client key", &bundle.client_key_pem),
+    ];
+    for (heading, pem) in sections {
+        println!("{heading}\n{pem}");
+    }
+}
+
+// Unit tests for the pure decision helpers and the local-mode writer.
+// Kubernetes I/O (`run_kubernetes`) is not exercised here.
+#[cfg(test)]
+mod tests {
+    use super::{
+        K8sAction, LocalAction, LocalPaths, decide_k8s, decide_local, sibling_temp_dir, tls_secret,
+        write_local_bundle,
+    };
+    use openshell_bootstrap::pki::generate_pki;
+    use std::path::Path;
+
+    // ── Kubernetes-mode decision ──
+
+    #[test]
+    fn decide_k8s_skip_when_both_exist() {
+        assert_eq!(decide_k8s(true, true), K8sAction::SkipExists);
+    }
+
+    #[test]
+    fn decide_k8s_create_when_neither_exists() {
+        assert_eq!(decide_k8s(false, false), K8sAction::Create);
+    }
+
+    #[test]
+    fn decide_k8s_partial_when_only_server_exists() {
+        assert_eq!(decide_k8s(true, false), K8sAction::PartialState);
+    }
+
+    #[test]
+    fn decide_k8s_partial_when_only_client_exists() {
+        assert_eq!(decide_k8s(false, true), K8sAction::PartialState);
+    }
+
+    // The Secret must carry the `kubernetes.io/tls` type and exactly the
+    // three data keys, with the PEM bytes stored unmodified.
+    #[test]
+    fn tls_secret_has_kubernetes_io_tls_type_and_three_keys() {
+        let s = tls_secret("foo", "CRT-PEM", "KEY-PEM", "CA-PEM");
+        assert_eq!(s.metadata.name.as_deref(), Some("foo"));
+        assert_eq!(s.type_.as_deref(), Some("kubernetes.io/tls"));
+        let data = s.data.expect("data set");
+        assert_eq!(data.len(), 3);
+        assert_eq!(data["tls.crt"].0, b"CRT-PEM");
+        assert_eq!(data["tls.key"].0, b"KEY-PEM");
+        assert_eq!(data["ca.crt"].0, b"CA-PEM");
+    }
+
+    // ── Local-mode decision ──
+
+    #[test]
+    fn decide_local_skip_when_all_six_present() {
+        assert_eq!(decide_local(6), LocalAction::Skip);
+    }
+
+    #[test]
+    fn decide_local_create_when_none_present() {
+        assert_eq!(decide_local(0), LocalAction::Create);
+    }
+
+    #[test]
+    fn decide_local_partial_for_any_count_in_between() {
+        for n in 1..=5 {
+            assert_eq!(decide_local(n), LocalAction::PartialState, "n = {n}");
+        }
+    }
+
+    // ── Local-mode layout & writes ──
+
+    // Pure path computation; nothing is created on disk.
+    #[test]
+    fn local_paths_resolve_matches_init_pki_layout() {
+        let p = LocalPaths::resolve(Path::new("/tmp/openshell/tls"));
+        assert_eq!(p.ca_crt, Path::new("/tmp/openshell/tls/ca.crt"));
+        assert_eq!(p.ca_key, Path::new("/tmp/openshell/tls/ca.key"));
+        assert_eq!(p.server_crt, Path::new("/tmp/openshell/tls/server/tls.crt"));
+        assert_eq!(p.server_key, Path::new("/tmp/openshell/tls/server/tls.key"));
+        assert_eq!(p.client_crt, Path::new("/tmp/openshell/tls/client/tls.crt"));
+        assert_eq!(p.client_key, Path::new("/tmp/openshell/tls/client/tls.key"));
+    }
+
+    #[test]
+    fn sibling_temp_dir_is_adjacent_to_target() {
+        assert_eq!(
+            sibling_temp_dir(Path::new("/var/lib/openshell/tls")),
+            Path::new("/var/lib/openshell/tls.certgen.tmp")
+        );
+    }
+
+    // First-run case: the target directory does not exist before the call.
+    #[test]
+    fn write_local_bundle_writes_six_files_and_removes_temp() {
+        let parent = tempfile::tempdir().expect("tempdir");
+        let dir = parent.path().join("tls");
+        let bundle = generate_pki(&[]).expect("generate_pki");
+        let paths = LocalPaths::resolve(&dir);
+
+        write_local_bundle(&dir, &bundle, &paths).expect("write_local_bundle");
+
+        for f in paths.all_files() {
+            assert!(f.is_file(), "missing {}", f.display());
+        }
+        assert!(
+            !sibling_temp_dir(&dir).exists(),
+            "temp dir should be cleaned up"
+        );
+
+        // Spot-check contents.
+        let ca = std::fs::read_to_string(&paths.ca_crt).unwrap();
+        assert!(ca.contains("BEGIN CERTIFICATE"));
+        let server_key = std::fs::read_to_string(&paths.server_key).unwrap();
+        assert!(server_key.contains("BEGIN PRIVATE KEY"));
+    }
+
+    // Only the three private keys are checked for mode 0600; certificate
+    // permissions are not asserted here.
+    #[cfg(unix)]
+    #[test]
+    fn write_local_bundle_sets_owner_only_on_keys() {
+        use std::os::unix::fs::PermissionsExt;
+        let parent = tempfile::tempdir().expect("tempdir");
+        let dir = parent.path().join("tls");
+        let bundle = generate_pki(&[]).expect("generate_pki");
+        let paths = LocalPaths::resolve(&dir);
+
+        write_local_bundle(&dir, &bundle, &paths).expect("write_local_bundle");
+
+        for key in [&paths.ca_key, &paths.server_key, &paths.client_key] {
+            let mode = std::fs::metadata(key).unwrap().permissions().mode() & 0o777;
+            assert_eq!(mode, 0o600, "key {} has mode {:o}", key.display(), mode);
+        }
+    }
+
+    // Pre-create the staging directory with junk in it, as an interrupted
+    // earlier run would leave behind.
+    #[test]
+    fn write_local_bundle_recovers_from_stale_temp_dir() {
+        let parent = tempfile::tempdir().expect("tempdir");
+        let dir = parent.path().join("tls");
+        let stale = sibling_temp_dir(&dir);
+        std::fs::create_dir_all(&stale).unwrap();
+        std::fs::write(stale.join("garbage"), "stale").unwrap();
+
+        let bundle = generate_pki(&[]).expect("generate_pki");
+        let paths = LocalPaths::resolve(&dir);
+        write_local_bundle(&dir, &bundle, &paths).expect("write_local_bundle");
+
+        assert!(paths.ca_crt.is_file());
+        assert!(!stale.exists(), "stale temp dir should be removed");
+    }
+}
diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs
index ccc08cf2b..b928be2e0 100644
--- a/crates/openshell-server/src/cli.rs
+++ b/crates/openshell-server/src/cli.rs
@@ -15,14 +15,34 @@ use std::path::PathBuf;
use tracing::info;
use tracing_subscriber::EnvFilter;
+use crate::certgen;
use crate::compute::{DockerComputeConfig, VmComputeConfig};
use crate::{run_server, tracing_bus::TracingLogBus};
/// `OpenShell` gateway process - gRPC and HTTP server with protocol multiplexing.
+///
+/// Top-level CLI. When invoked without a subcommand the binary runs the
+/// gateway server using `RunArgs`. The `generate-certs` subcommand bootstraps
+/// the mTLS PKI: the Helm hook Job uses it to create Kubernetes Secrets, and
+/// with `--output-dir` it writes PEM files to the filesystem instead.
#[derive(Parser, Debug)]
#[command(version = openshell_core::VERSION)]
#[command(about = "OpenShell gRPC/HTTP server", long_about = None)]
-struct Args {
+struct Cli {
+    /// Optional subcommand; `None` means "run the gateway server".
+    #[command(subcommand)]
+    command: Option<Commands>,
+
+    /// Server arguments, used only when no subcommand is given.
+    #[command(flatten)]
+    run: RunArgs,
+}
+
+#[derive(clap::Subcommand, Debug)]
+enum Commands {
+    /// Generate mTLS PKI and write it to Kubernetes Secrets, or to a directory with `--output-dir`.
+    GenerateCerts(certgen::CertgenArgs),
+}
+
+#[derive(clap::Args, Debug)]
+struct RunArgs {
/// IP address to bind the server, health, and metrics listeners to.
#[arg(long, default_value = "127.0.0.1", env = "OPENSHELL_BIND_ADDRESS")]
bind_address: IpAddr,
@@ -58,8 +78,12 @@ struct Args {
tls_client_ca: Option,
/// Database URL for persistence.
- #[arg(long, env = "OPENSHELL_DB_URL", required = true)]
- db_url: String,
+ ///
+ /// Required when running the gateway. Validated at the call site rather
+ /// than as a clap-level requirement so the `generate-certs` subcommand
+ /// (which does not need a database) can run without it.
+ #[arg(long, env = "OPENSHELL_DB_URL")]
+ db_url: Option,
/// Compute drivers configured for this gateway.
///
@@ -279,7 +303,7 @@ struct Args {
}
pub fn command() -> Command {
- Args::command()
+ Cli::command()
.name("openshell-gateway")
.bin_name("openshell-gateway")
}
@@ -289,12 +313,15 @@ pub async fn run_cli() -> Result<()> {
.install_default()
.map_err(|e| miette::miette!("failed to install rustls crypto provider: {e:?}"))?;
- let args = Args::from_arg_matches(&command().get_matches()).expect("clap validated args");
+ let cli = Cli::from_arg_matches(&command().get_matches()).expect("clap validated args");
- Box::pin(run_from_args(args)).await
+ match cli.command {
+ Some(Commands::GenerateCerts(args)) => certgen::run(args).await,
+ None => Box::pin(run_from_args(cli.run)).await,
+ }
}
-async fn run_from_args(args: Args) -> Result<()> {
+async fn run_from_args(args: RunArgs) -> Result<()> {
let tracing_log_bus = TracingLogBus::new();
tracing_log_bus.install_subscriber(
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&args.log_level)),
@@ -326,6 +353,10 @@ async fn run_from_args(args: Args) -> Result<()> {
})
};
+ let db_url = args
+ .db_url
+ .ok_or_else(|| miette::miette!("--db-url is required (or set OPENSHELL_DB_URL)"))?;
+
let mut config = openshell_core::Config::new(tls)
.with_bind_address(bind)
.with_log_level(&args.log_level);
@@ -359,7 +390,7 @@ async fn run_from_args(args: Args) -> Result<()> {
}
config = config
- .with_database_url(args.db_url)
+ .with_database_url(db_url)
.with_compute_drivers(args.drivers)
.with_sandbox_namespace(args.sandbox_namespace)
.with_ssh_gateway_host(args.ssh_gateway_host)
@@ -444,7 +475,7 @@ fn parse_compute_driver(value: &str) -> std::result::Result at the clap level so subcommand parsing
+ // does not require it. The Run path validates it inside
+ // run_from_args. This test asserts the parse step succeeds with no
+ // --db-url, mirroring what the runtime check sees.
+ let _lock = ENV_LOCK
+ .lock()
+ .unwrap_or_else(std::sync::PoisonError::into_inner);
+ let _g = EnvVarGuard::remove("OPENSHELL_DB_URL");
+
+ let cli = Cli::try_parse_from(["openshell-gateway"]).expect("parses without --db-url");
+ assert!(cli.command.is_none());
+ assert!(cli.run.db_url.is_none());
}
}
diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs
index a80301c12..f63cf1915 100644
--- a/crates/openshell-server/src/lib.rs
+++ b/crates/openshell-server/src/lib.rs
@@ -20,6 +20,7 @@
//! [`compute::vm`]; keep this file driver-agnostic going forward.
mod auth;
+pub mod certgen;
pub mod cli;
mod compute;
mod grpc;
diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md
index ee7565f29..cc856731d 100644
--- a/deploy/helm/openshell/README.md
+++ b/deploy/helm/openshell/README.md
@@ -52,3 +52,21 @@ See [`values.yaml`](values.yaml) for configurable values. Selected overlays:
- [`ci/values-gateway.yaml`](ci/values-gateway.yaml) — gateway-only configuration
- [`ci/values-cert-manager.yaml`](ci/values-cert-manager.yaml) — cert-manager integration
- [`ci/values-keycloak.yaml`](ci/values-keycloak.yaml) — Keycloak OIDC integration
+
+## PKI bootstrap
+
+By default, a pre-install/pre-upgrade hook Job runs `openshell-gateway generate-certs`
+to create the gateway's server and client mTLS Secrets. The Job uses the gateway image
+itself, so air-gapped environments only need to mirror that one image (no separate
+openssl/alpine sidecar).
+
+The Job is idempotent:
+
+- Both target Secrets exist → log and exit 0.
+- Exactly one exists → fail with a `kubectl delete secret -n <namespace> <server-secret> <client-secret>` recovery hint.
+- Neither exists → generate a CA, server cert, and client cert; POST both `kubernetes.io/tls`
+ Secrets (`tls.crt`, `tls.key`, `ca.crt`).
+
+Disable with `--set pkiInitJob.enabled=false` when bringing your own PKI (cert-manager,
+external CA, or pre-created Secrets). See `certManager.*` in `values.yaml` for the
+cert-manager alternative.
diff --git a/deploy/helm/openshell/templates/certgen.yaml b/deploy/helm/openshell/templates/certgen.yaml
new file mode 100644
index 000000000..d8136d581
--- /dev/null
+++ b/deploy/helm/openshell/templates/certgen.yaml
@@ -0,0 +1,109 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if and .Values.pkiInitJob.enabled .Values.certManager.enabled }}
+{{- fail "pkiInitJob.enabled and certManager.enabled cannot both be true; disable one to avoid conflicting PKI sources." }}
+{{- end }}
+{{- if .Values.pkiInitJob.enabled }}
+{{- $hookName := printf "%s-certgen" (include "openshell.fullname" .) }}
+{{- $ns := .Release.Namespace }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: {{ $hookName }}
+ namespace: {{ $ns }}
+ labels:
+ {{- include "openshell.labels" . | nindent 4 }}
+ annotations:
+ helm.sh/hook: pre-install,pre-upgrade
+ helm.sh/hook-weight: "-30"
+ helm.sh/hook-delete-policy: before-hook-creation
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+ name: {{ $hookName }}
+ namespace: {{ $ns }}
+ labels:
+ {{- include "openshell.labels" . | nindent 4 }}
+ annotations:
+ helm.sh/hook: pre-install,pre-upgrade
+ helm.sh/hook-weight: "-30"
+ helm.sh/hook-delete-policy: before-hook-creation
+rules:
+ - apiGroups: [""]
+ resources: ["secrets"]
+ verbs: ["get", "create"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+ name: {{ $hookName }}
+ namespace: {{ $ns }}
+ labels:
+ {{- include "openshell.labels" . | nindent 4 }}
+ annotations:
+ helm.sh/hook: pre-install,pre-upgrade
+ helm.sh/hook-weight: "-30"
+ helm.sh/hook-delete-policy: before-hook-creation
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: Role
+ name: {{ $hookName }}
+subjects:
+ - kind: ServiceAccount
+ name: {{ $hookName }}
+ namespace: {{ $ns }}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: {{ $hookName }}
+ namespace: {{ $ns }}
+ labels:
+ {{- include "openshell.labels" . | nindent 4 }}
+ annotations:
+ helm.sh/hook: pre-install,pre-upgrade
+ helm.sh/hook-weight: "-20"
+ helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
+spec:
+ backoffLimit: 3
+ activeDeadlineSeconds: 120
+ ttlSecondsAfterFinished: 300
+ template:
+ metadata:
+ labels:
+ {{- include "openshell.selectorLabels" . | nindent 8 }}
+ spec:
+ restartPolicy: OnFailure
+ serviceAccountName: {{ $hookName }}
+ {{- with .Values.imagePullSecrets }}
+ imagePullSecrets:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ containers:
+ - name: certgen
+ image: {{ include "openshell.image" . | quote }}
+ imagePullPolicy: {{ .Values.image.pullPolicy }}
+ securityContext:
+ allowPrivilegeEscalation: false
+ capabilities:
+ drop:
+ - ALL
+ env:
+ - name: POD_NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.namespace
+ command: ["openshell-gateway"]
+ args:
+ - generate-certs
+ - --server-secret-name={{ .Values.server.tls.certSecretName }}
+ - --client-secret-name={{ .Values.server.tls.clientTlsSecretName }}
+ {{- range .Values.pkiInitJob.serverDnsNames }}
+ - --server-san={{ . }}
+ {{- end }}
+ {{- range .Values.pkiInitJob.serverIpAddresses }}
+ - --server-san={{ . }}
+ {{- end }}
+{{- end }}
diff --git a/deploy/helm/openshell/templates/pki-hook.yaml b/deploy/helm/openshell/templates/pki-hook.yaml
deleted file mode 100644
index c5e83c734..000000000
--- a/deploy/helm/openshell/templates/pki-hook.yaml
+++ /dev/null
@@ -1,191 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-{{- if and .Values.pkiInitJob.enabled .Values.certManager.enabled }}
-{{- fail "pkiInitJob.enabled and certManager.enabled cannot both be true; disable one to avoid conflicting PKI sources." }}
-{{- end }}
-{{- if .Values.pkiInitJob.enabled }}
-{{- $hookName := printf "%s-pki-hook" (include "openshell.fullname" .) }}
-{{- $ns := .Release.Namespace }}
-{{- $serverSecret := .Values.server.tls.certSecretName }}
-{{- $clientSecret := .Values.server.tls.clientTlsSecretName }}
-{{- $sanParts := list }}
-{{- range .Values.pkiInitJob.serverDnsNames }}{{- $sanParts = append $sanParts (printf "DNS:%s" .) }}{{- end }}
-{{- range .Values.pkiInitJob.serverIpAddresses }}{{- $sanParts = append $sanParts (printf "IP:%s" .) }}{{- end }}
-{{- $serverSans := join "," $sanParts }}
-apiVersion: v1
-kind: ServiceAccount
-metadata:
- name: {{ $hookName }}
- namespace: {{ $ns }}
- labels:
- {{- include "openshell.labels" . | nindent 4 }}
- annotations:
- helm.sh/hook: pre-install,pre-upgrade
- helm.sh/hook-weight: "-30"
- helm.sh/hook-delete-policy: before-hook-creation
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: Role
-metadata:
- name: {{ $hookName }}
- namespace: {{ $ns }}
- labels:
- {{- include "openshell.labels" . | nindent 4 }}
- annotations:
- helm.sh/hook: pre-install,pre-upgrade
- helm.sh/hook-weight: "-30"
- helm.sh/hook-delete-policy: before-hook-creation
-rules:
- - apiGroups: [""]
- resources: ["secrets"]
- verbs: ["get", "create"]
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
-metadata:
- name: {{ $hookName }}
- namespace: {{ $ns }}
- labels:
- {{- include "openshell.labels" . | nindent 4 }}
- annotations:
- helm.sh/hook: pre-install,pre-upgrade
- helm.sh/hook-weight: "-30"
- helm.sh/hook-delete-policy: before-hook-creation
-roleRef:
- apiGroup: rbac.authorization.k8s.io
- kind: Role
- name: {{ $hookName }}
-subjects:
- - kind: ServiceAccount
- name: {{ $hookName }}
- namespace: {{ $ns }}
----
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: {{ $hookName }}
- namespace: {{ $ns }}
- labels:
- {{- include "openshell.labels" . | nindent 4 }}
- annotations:
- helm.sh/hook: pre-install,pre-upgrade
- helm.sh/hook-weight: "-20"
- helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
-spec:
- backoffLimit: 3
- activeDeadlineSeconds: 120
- ttlSecondsAfterFinished: 300
- template:
- metadata:
- labels:
- {{- include "openshell.selectorLabels" . | nindent 8 }}
- spec:
- restartPolicy: OnFailure
- serviceAccountName: {{ $hookName }}
- containers:
- - name: pki-gen
- image: {{ .Values.pkiInitJob.image.repository }}:{{ .Values.pkiInitJob.image.tag }}
- imagePullPolicy: {{ .Values.pkiInitJob.image.pullPolicy }}
- securityContext:
- allowPrivilegeEscalation: false
- capabilities:
- drop:
- - ALL
- env:
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: metadata.namespace
- - name: SERVER_SECRET
- value: {{ $serverSecret | quote }}
- - name: CLIENT_SECRET
- value: {{ $clientSecret | quote }}
- - name: CA_DAYS
- value: {{ .Values.pkiInitJob.caValidityDays | quote }}
- - name: CERT_DAYS
- value: {{ .Values.pkiInitJob.certValidityDays | quote }}
- - name: SERVER_SANS
- value: {{ $serverSans | quote }}
- command:
- - /bin/sh
- - -c
- - |
- set -eu
- apk add --no-cache openssl curl >/dev/null 2>&1
-
- TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
- K8S_CA=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
- API=https://kubernetes.default.svc
-
- # Idempotency: skip only when both TLS secrets already exist.
- # Checking one is insufficient — a partial cleanup can leave one half
- # of the pair behind, which would cause mTLS to fail at runtime.
- HTTP_SERVER=$(curl -s -o /dev/null -w "%{http_code}" \
- -H "Authorization: Bearer $TOKEN" --cacert "$K8S_CA" \
- "$API/api/v1/namespaces/$NAMESPACE/secrets/$SERVER_SECRET")
- HTTP_CLIENT=$(curl -s -o /dev/null -w "%{http_code}" \
- -H "Authorization: Bearer $TOKEN" --cacert "$K8S_CA" \
- "$API/api/v1/namespaces/$NAMESPACE/secrets/$CLIENT_SECRET")
- if [ "$HTTP_SERVER" = "200" ] && [ "$HTTP_CLIENT" = "200" ]; then
- echo "PKI secrets already exist, skipping."
- exit 0
- fi
- if [ "$HTTP_SERVER" = "200" ] || [ "$HTTP_CLIENT" = "200" ]; then
- echo "ERROR: partial PKI state — one secret exists but not both." >&2
- echo "To recover: kubectl delete secret -n $NAMESPACE $SERVER_SECRET $CLIENT_SECRET" >&2
- exit 1
- fi
-
- cd /tmp
-
- # CA (ECDSA P-256)
- openssl genpkey -algorithm EC -pkeyopt ec_paramgen_curve:P-256 -out ca.key 2>/dev/null
- openssl req -new -x509 -sha256 -key ca.key -out ca.crt \
- -days "$CA_DAYS" -subj "/O=openshell/CN=openshell-ca" \
- -addext "basicConstraints=critical,CA:TRUE,pathlen:0" \
- -addext "keyUsage=critical,keyCertSign,cRLSign"
-
- # Server cert (ECDSA P-256)
- printf "[ext]\nsubjectAltName=%s\nextendedKeyUsage=serverAuth\nkeyUsage=digitalSignature,keyEncipherment\n" \
- "$SERVER_SANS" > server.ext
- openssl genpkey -algorithm EC -pkeyopt ec_paramgen_curve:P-256 -out server.key 2>/dev/null
- openssl req -new -sha256 -key server.key -out server.csr -subj "/CN=openshell-server"
- openssl x509 -req -sha256 -in server.csr -CA ca.crt -CAkey ca.key \
- -CAcreateserial -days "$CERT_DAYS" -extensions ext -extfile server.ext -out server.crt
-
- # Client cert (ECDSA P-256)
- printf "[ext]\nextendedKeyUsage=clientAuth\nkeyUsage=digitalSignature,keyEncipherment\n" \
- > client.ext
- openssl genpkey -algorithm EC -pkeyopt ec_paramgen_curve:P-256 -out client.key 2>/dev/null
- openssl req -new -sha256 -key client.key -out client.csr -subj "/CN=openshell-client"
- openssl x509 -req -sha256 -in client.csr -CA ca.crt -CAkey ca.key \
- -CAcreateserial -days "$CERT_DAYS" -extensions ext -extfile client.ext -out client.crt
-
- CA_B64=$(base64 -w0 ca.crt)
- SERVER_CRT_B64=$(base64 -w0 server.crt)
- SERVER_KEY_B64=$(base64 -w0 server.key)
- CLIENT_CRT_B64=$(base64 -w0 client.crt)
- CLIENT_KEY_B64=$(base64 -w0 client.key)
-
- # Create server TLS secret
- printf '{"apiVersion":"v1","kind":"Secret","metadata":{"name":"%s","namespace":"%s"},"type":"kubernetes.io/tls","data":{"tls.crt":"%s","tls.key":"%s","ca.crt":"%s"}}\n' \
- "$SERVER_SECRET" "$NAMESPACE" \
- "$SERVER_CRT_B64" "$SERVER_KEY_B64" "$CA_B64" > server-secret.json
- curl -sf -X POST \
- -H "Authorization: Bearer $TOKEN" -H "Content-Type: application/json" \
- --cacert "$K8S_CA" "$API/api/v1/namespaces/$NAMESPACE/secrets" \
- -d @server-secret.json
-
- # Create client TLS secret
- printf '{"apiVersion":"v1","kind":"Secret","metadata":{"name":"%s","namespace":"%s"},"type":"kubernetes.io/tls","data":{"tls.crt":"%s","tls.key":"%s","ca.crt":"%s"}}\n' \
- "$CLIENT_SECRET" "$NAMESPACE" \
- "$CLIENT_CRT_B64" "$CLIENT_KEY_B64" "$CA_B64" > client-secret.json
- curl -sf -X POST \
- -H "Authorization: Bearer $TOKEN" -H "Content-Type: application/json" \
- --cacert "$K8S_CA" "$API/api/v1/namespaces/$NAMESPACE/secrets" \
- -d @client-secret.json
-
- rm -f *.key *.csr *.crt *.ext *.srl *.json
- echo "PKI secrets created."
-{{- end }}
diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml
index f8e090721..2b1051d06 100644
--- a/deploy/helm/openshell/values.yaml
+++ b/deploy/helm/openshell/values.yaml
@@ -154,32 +154,23 @@ sshHandshake:
value: ""
# PKI bootstrap via a pre-install/pre-upgrade hook Job.
-# Generates a self-signed CA, server TLS secret, and client TLS secret using
-# openssl (ECDSA P-256) inside the cluster. Key material is written directly to
-# K8s Secrets and never appears in Helm release history. Idempotent: existing
-# secrets are left untouched on upgrade.
-# Air-gapped environments should override pkiInitJob.image with an image that has
-# openssl and curl pre-installed (the default alpine image fetches them at runtime).
+# Runs `openshell-gateway generate-certs` to create the server and client TLS
+# Secrets in-cluster. Key material is written directly to K8s Secrets and
+# never appears in Helm release history. Idempotent: existing secrets are
+# left untouched on upgrade. Reuses the gateway image — no extra image to
+# mirror in air-gapped environments.
+#
+# The gateway binary adds its built-in cluster SANs to the server certificate
+# on its own: `openshell`, `openshell.openshell.svc`, the cluster.local FQDN,
+# `localhost`, `host.docker.internal`, and `127.0.0.1`. The lists below are
+# *additional* SANs appended on top — typically a public hostname or
+# load-balancer IP for remote deployments.
pkiInitJob:
enabled: true
- image:
- repository: alpine
- tag: "3"
- pullPolicy: IfNotPresent
- # Days until the CA certificate expires.
- caValidityDays: 3650
- # Days until server and client certificates expire.
- certValidityDays: 3650
- # DNS SANs for the server certificate.
- serverDnsNames:
- - openshell
- - openshell.openshell.svc
- - openshell.openshell.svc.cluster.local
- - localhost
- - host.docker.internal
- # IP SANs for the server certificate.
- serverIpAddresses:
- - 127.0.0.1
+ # Extra DNS SANs to append to the server certificate.
+ serverDnsNames: []
+ # Extra IP SANs to append to the server certificate.
+ serverIpAddresses: []
# cert-manager Certificate/Issuer resources (requires cert-manager CRDs in-cluster).
# Uses namespaced Issuers only (no ClusterIssuer). Does not install cert-manager itself.