diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 702cda132..b883631b7 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -184,7 +184,7 @@ Component images (server, sandbox) can reach kubelet via two paths: **Local/external pull mode** (default local via `mise run cluster`): Local images are tagged to the configured local registry base (default `127.0.0.1:5000/openshell/*`), pushed to that registry, and pulled by k3s via `registries.yaml` mirror endpoint (typically `host.docker.internal:5000`). The `cluster` task pushes prebuilt local tags (`openshell/*:dev`, falling back to `localhost:5000/openshell/*:dev` or `127.0.0.1:5000/openshell/*:dev`). -Gateway image builds now stage a partial Rust workspace from `deploy/docker/Dockerfile.images`. If cargo fails with a missing manifest under `/build/crates/...`, or an imported symbol exists locally but is missing in the image build, verify that every current gateway dependency crate (including `openshell-driver-kubernetes` and `openshell-ocsf`) is copied into the staged workspace there. +Gateway image builds now stage a partial Rust workspace from `deploy/docker/Dockerfile.images`. If cargo fails with a missing manifest under `/build/crates/...`, or an imported symbol exists locally but is missing in the image build, verify that every current gateway dependency crate (including `openshell-driver-docker`, `openshell-driver-kubernetes`, and `openshell-ocsf`) is copied into the staged workspace there. ```bash # Verify image refs currently used by openshell deployment diff --git a/AGENTS.md b/AGENTS.md index 8968c5296..858e104f5 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -41,6 +41,7 @@ These pipelines connect skills into end-to-end workflows. Individual skill files | `crates/openshell-tui/` | Terminal UI | Ratatui-based dashboard for monitoring | | `crates/openshell-vm/` | MicroVM runtime | Experimental, work-in-progress libkrun-based VM execution | | `crates/openshell-driver-kubernetes/` | Kubernetes compute driver | In-process `ComputeDriver` backend for K8s sandbox pods | +| `crates/openshell-driver-docker/` | Docker compute driver | In-process `ComputeDriver` backend for local Docker sandbox containers | | `crates/openshell-driver-vm/` | VM compute driver | Standalone libkrun-backed `ComputeDriver` subprocess (embeds its own rootfs + runtime) | | `python/openshell/` | Python SDK | Python bindings and CLI packaging | | `proto/` | Protobuf definitions | gRPC service contracts | diff --git a/Cargo.lock b/Cargo.lock index 967cce7f9..e58168a8a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3137,6 +3137,23 @@ dependencies = [ "url", ] +[[package]] +name = "openshell-driver-docker" +version = "0.0.0" +dependencies = [ + "bollard", + "bytes", + "futures", + "openshell-core", + "tar", + "tempfile", + "tokio", + "tokio-stream", + "tonic", + "tracing", + "url", +] + [[package]] name = "openshell-driver-kubernetes" version = "0.0.0" @@ -3334,6 +3351,7 @@ dependencies = [ "metrics-exporter-prometheus", "miette", "openshell-core", + "openshell-driver-docker", "openshell-driver-kubernetes", "openshell-driver-podman", "openshell-ocsf", @@ -3364,7 +3382,6 @@ dependencies = [ "tower-http 0.6.8", "tracing", "tracing-subscriber", - "url", "uuid", "wiremock", ] diff --git a/architecture/gateway.md b/architecture/gateway.md index 5fb82717a..e83640a43 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -76,6 +76,7 @@ graph TD | Persistence | `crates/openshell-server/src/persistence/mod.rs` | `Store` enum (SQLite/Postgres), generic object CRUD, protobuf codec | | Compute runtime | `crates/openshell-server/src/compute/mod.rs` | `ComputeRuntime`, gateway-owned sandbox lifecycle orchestration over a compute backend | | Compute driver: Kubernetes | `crates/openshell-driver-kubernetes/src/driver.rs` | Kubernetes CRD create/delete/watch, pod template translation | +| Compute driver: Docker | `crates/openshell-driver-docker/src/lib.rs` | Local Docker container create/stop/delete/watch | | Compute driver: VM | `crates/openshell-driver-vm/src/driver.rs` | Per-sandbox microVM create/delete/watch, supervisor-only guest boot | | Sandbox index | `crates/openshell-server/src/sandbox_index.rs` | `SandboxIndex` -- in-memory name/pod-to-id correlation | | Watch bus | `crates/openshell-server/src/sandbox_watch.rs` | `SandboxWatchBus` -- in-memory broadcast for persisted sandbox updates | @@ -103,6 +104,7 @@ The gateway boots in `cli::run_cli` (`crates/openshell-server/src/cli.rs`) and p 1. Connect to the persistence store (`Store::connect`), which auto-detects SQLite vs Postgres from the URL prefix and runs migrations. 2. Create `ComputeRuntime` with a `ComputeDriver` implementation selected by `OPENSHELL_DRIVERS`: - `kubernetes` wraps `KubernetesComputeDriver` in `ComputeDriverService`, so the gateway uses the `openshell.compute.v1.ComputeDriver` RPC surface even without transport. + - `docker` constructs `openshell-driver-docker` in-process and manages local containers labeled with the configured sandbox namespace. - `vm` spawns the standalone `openshell-driver-vm` binary as a local compute-driver process, resolves it from `--driver-dir`, conventional libexec install paths, or a sibling of the gateway binary, connects to it over a Unix domain socket, and keeps the libkrun/rootfs runtime out of the gateway binary. 3. Build `ServerState` (shared via `Arc` across all handlers), including a fresh `SupervisorSessionRegistry`. 4. **Spawn background tasks**: @@ -116,7 +118,7 @@ The gateway boots in `cli::run_cli` (`crates/openshell-server/src/cli.rs`) and p ## Configuration -All configuration is via CLI flags with environment variable fallbacks. The `--db-url` and `--ssh-handshake-secret` flags are required. +All configuration is via CLI flags with environment variable fallbacks. The `--db-url` flag is required. The `--ssh-handshake-secret` flag is required for non-Docker drivers; Docker sandboxes do not receive a handshake secret. | Flag | Env Var | Default | Description | |------|---------|---------|-------------| @@ -132,7 +134,7 @@ All configuration is via CLI flags with environment variable fallbacks. The `--d | `--sandbox-namespace` | `OPENSHELL_SANDBOX_NAMESPACE` | `default` | Kubernetes namespace for sandbox CRDs | | `--sandbox-image` | `OPENSHELL_SANDBOX_IMAGE` | None | Default container image for sandbox pods | | `--grpc-endpoint` | `OPENSHELL_GRPC_ENDPOINT` | None | gRPC endpoint reachable from within the cluster (for supervisor callbacks) | -| `--drivers` | `OPENSHELL_DRIVERS` | `kubernetes` | Compute backend to use. Current options are `kubernetes` and `vm`. | +| `--drivers` | `OPENSHELL_DRIVERS` | `kubernetes` | Compute backend to use. Current options are `kubernetes`, `docker`, and `vm`. | | `--vm-driver-state-dir` | `OPENSHELL_VM_DRIVER_STATE_DIR` | `target/openshell-vm-driver` | Host directory for VM sandbox rootfs, console logs, and runtime state | | `--driver-dir` | `OPENSHELL_DRIVER_DIR` | unset | Override directory for `openshell-driver-vm`. When unset, the gateway searches `~/.local/libexec/openshell`, `/usr/local/libexec/openshell`, `/usr/local/libexec`, then a sibling binary. | | `--vm-krun-log-level` | `OPENSHELL_VM_KRUN_LOG_LEVEL` | `1` | libkrun log level for VM helper processes | @@ -600,6 +602,18 @@ The Helm chart template is at `deploy/helm/openshell/templates/statefulset.yaml` The gateway reaches the sandbox exclusively through the supervisor-initiated `ConnectSupervisor` session, so the driver never returns sandbox network endpoints. +### Docker Driver + +The Docker driver (`crates/openshell-driver-docker/src/lib.rs`) is an in-process compute backend for local standalone gateways. It creates one Docker container per sandbox, labels each container with `openshell.ai/managed-by=openshell`, `openshell.ai/sandbox-id`, `openshell.ai/sandbox-name`, and `openshell.ai/sandbox-namespace`, and bind-mounts a Linux `openshell-sandbox` supervisor binary into the container. + +- **Create**: Pulls or validates the sandbox image according to `sandbox_image_pull_policy`, creates a labeled container, mounts the supervisor binary and optional TLS material, and starts the container with the supervisor as entrypoint. +- **List/Get/Watch**: Reads labeled containers in the configured sandbox namespace and derives driver-native sandbox status from Docker state plus supervisor relay readiness. +- **Stop**: Stops the matching labeled container without deleting it. +- **Delete**: Force-removes the matching labeled container. +- **Gateway shutdown**: On SIGINT or SIGTERM, `run_server()` leaves the accept loop and calls the Docker shutdown cleanup hook. The hook stops all running, restarting, or paused OpenShell-managed containers in the configured sandbox namespace so local sandboxes do not keep running after the gateway exits. +- **Gateway startup resume**: Before the watch and reconcile loops spawn, `ComputeRuntime::resume_persisted_sandboxes()` walks every sandbox record in the store. For each sandbox whose phase is `Provisioning`, `Ready`, or `Unknown`, it asks the Docker driver to start the labeled container if it is in the `exited` or `created` state (`StartupResume::resume_sandbox`). Containers in `running` or `restarting` are left alone; `paused`, `dead`, and `removing` are skipped. If the matching container has disappeared, the sandbox is moved to phase `Error` with reason `BackendResourceMissing`; if the start call fails, the sandbox moves to `Error` with reason `ResumeFailed`. This is what makes sandboxes survive a graceful gateway restart end-to-end: shutdown stops them, the next startup resumes them, and the store remains the source of truth across the cycle. Drivers that do not need this hook (Kubernetes, Podman, VM) leave `startup_resume = None`, which makes the resume sweep a no-op. +- **Handshake secret**: The Docker driver does not inject `OPENSHELL_SSH_HANDSHAKE_SECRET` or `OPENSHELL_SSH_HANDSHAKE_SKEW_SECS` into containers. Supervisor relay auth relies on the gateway connection rather than a Docker-visible container env secret. + ### VM Driver `VmDriver` (`crates/openshell-driver-vm/src/driver.rs`) is served by the standalone `openshell-driver-vm` process. The gateway spawns that binary on demand and talks to it over the internal `openshell.compute.v1.ComputeDriver` gRPC contract via a Unix domain socket. diff --git a/architecture/sandbox-connect.md b/architecture/sandbox-connect.md index 2f5e62b2b..499532fb9 100644 --- a/architecture/sandbox-connect.md +++ b/architecture/sandbox-connect.md @@ -623,7 +623,7 @@ The sandbox SSH daemon's exit thread waits for the reader thread to finish forwa ### Sandbox environment variables -These are injected into compute-backed sandboxes by the **Kubernetes** driver (`crates/openshell-driver-kubernetes/src/driver.rs`) and the **Podman** driver (`crates/openshell-driver-podman/src/container.rs`). Together they are required for **persistent `ConnectSupervisor` registration and relay** (see [Podman and relay environment](#podman-and-relay-environment) for the Podman-specific fix): +These are injected into compute-backed sandboxes by the **Kubernetes** driver (`crates/openshell-driver-kubernetes/src/driver.rs`), the **Podman** driver (`crates/openshell-driver-podman/src/container.rs`), and the **Docker** driver (`crates/openshell-driver-docker/src/lib.rs`). Together they are required for **persistent `ConnectSupervisor` registration and relay** (see [Podman and relay environment](#podman-and-relay-environment) for the Podman-specific fix): | Variable | Description | |---|---| diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index 2fbdb1b1d..40a87fc41 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -48,6 +48,7 @@ pub const CDI_GPU_DEVICE_ALL: &str = "nvidia.com/gpu=all"; pub enum ComputeDriverKind { Kubernetes, Vm, + Docker, Podman, } @@ -57,6 +58,7 @@ impl ComputeDriverKind { match self { Self::Kubernetes => "kubernetes", Self::Vm => "vm", + Self::Docker => "docker", Self::Podman => "podman", } } @@ -75,9 +77,10 @@ impl FromStr for ComputeDriverKind { match value.trim().to_ascii_lowercase().as_str() { "kubernetes" => Ok(Self::Kubernetes), "vm" => Ok(Self::Vm), + "docker" => Ok(Self::Docker), "podman" => Ok(Self::Podman), other => Err(format!( - "unsupported compute driver '{other}'. expected one of: kubernetes, vm, podman" + "unsupported compute driver '{other}'. expected one of: kubernetes, vm, docker, podman" )), } } @@ -450,12 +453,16 @@ mod tests { "podman".parse::().unwrap(), ComputeDriverKind::Podman ); + assert_eq!( + "docker".parse::().unwrap(), + ComputeDriverKind::Docker + ); } #[test] fn compute_driver_kind_rejects_unknown_values() { - let err = "docker".parse::().unwrap_err(); - assert!(err.contains("unsupported compute driver 'docker'")); + let err = "firecracker".parse::().unwrap_err(); + assert!(err.contains("unsupported compute driver 'firecracker'")); } #[test] diff --git a/crates/openshell-driver-docker/Cargo.toml b/crates/openshell-driver-docker/Cargo.toml new file mode 100644 index 000000000..8e2fd1777 --- /dev/null +++ b/crates/openshell-driver-docker/Cargo.toml @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "openshell-driver-docker" +description = "Docker compute driver for OpenShell" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true + +[dependencies] +openshell-core = { path = "../openshell-core" } + +tokio = { workspace = true } +tonic = { workspace = true } +futures = { workspace = true } +tokio-stream = { workspace = true } +tracing = { workspace = true } +bytes = { workspace = true } +url = { workspace = true } +bollard = { version = "0.20" } +tar = "0.4" +tempfile = "3" + +[lints] +workspace = true diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs new file mode 100644 index 000000000..d79b4bfee --- /dev/null +++ b/crates/openshell-driver-docker/src/lib.rs @@ -0,0 +1,1777 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Docker compute driver. + +#![allow(clippy::result_large_err)] + +use bollard::Docker; +use bollard::errors::Error as BollardError; +use bollard::models::{ + ContainerCreateBody, ContainerSummary, ContainerSummaryStateEnum, HostConfig, Mount, + MountTypeEnum, RestartPolicy, RestartPolicyNameEnum, +}; +use bollard::query_parameters::{ + CreateContainerOptionsBuilder, CreateImageOptions, DownloadFromContainerOptionsBuilder, + ListContainersOptionsBuilder, RemoveContainerOptionsBuilder, StopContainerOptionsBuilder, +}; +use bytes::Bytes; +use futures::{Stream, StreamExt}; +use openshell_core::config::DEFAULT_STOP_TIMEOUT_SECS; +use openshell_core::proto::compute::v1::{ + CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse, + DriverCondition, DriverSandbox, DriverSandboxStatus, DriverSandboxTemplate, + GetCapabilitiesRequest, GetCapabilitiesResponse, GetSandboxRequest, GetSandboxResponse, + ListSandboxesRequest, ListSandboxesResponse, StopSandboxRequest, StopSandboxResponse, + ValidateSandboxCreateRequest, ValidateSandboxCreateResponse, WatchSandboxesDeletedEvent, + WatchSandboxesEvent, WatchSandboxesRequest, WatchSandboxesSandboxEvent, + compute_driver_server::ComputeDriver, watch_sandboxes_event, +}; +use openshell_core::{Config, Error, Result as CoreResult}; +use std::collections::HashMap; +use std::io::Read; +use std::path::{Path, PathBuf}; +use std::pin::Pin; +use std::sync::Arc; +use std::time::Duration; +use tokio::sync::{broadcast, mpsc}; +use tokio_stream::wrappers::ReceiverStream; +use tonic::{Request, Response, Status}; +use tracing::{info, warn}; +use url::{Host, Url}; + +const WATCH_BUFFER: usize = 128; +const WATCH_POLL_INTERVAL: Duration = Duration::from_secs(2); +const WATCH_POLL_MAX_BACKOFF: Duration = Duration::from_secs(30); + +const MANAGED_BY_LABEL_KEY: &str = "openshell.ai/managed-by"; +const MANAGED_BY_LABEL_VALUE: &str = "openshell"; +const SANDBOX_ID_LABEL_KEY: &str = "openshell.ai/sandbox-id"; +const SANDBOX_NAME_LABEL_KEY: &str = "openshell.ai/sandbox-name"; +const SANDBOX_NAMESPACE_LABEL_KEY: &str = "openshell.ai/sandbox-namespace"; + +const SUPERVISOR_MOUNT_PATH: &str = "/opt/openshell/bin/openshell-sandbox"; +const TLS_CA_MOUNT_PATH: &str = "/etc/openshell/tls/client/ca.crt"; +const TLS_CERT_MOUNT_PATH: &str = "/etc/openshell/tls/client/tls.crt"; +const TLS_KEY_MOUNT_PATH: &str = "/etc/openshell/tls/client/tls.key"; +const SANDBOX_COMMAND: &str = "sleep infinity"; +const HOST_OPENSHELL_INTERNAL: &str = "host.openshell.internal"; +const HOST_DOCKER_INTERNAL: &str = "host.docker.internal"; + +/// Default image holding the Linux `openshell-sandbox` binary. The gateway +/// pulls this image and extracts the binary to a host-side cache when no +/// explicit `--docker-supervisor-bin` override or local build is available. +const DEFAULT_DOCKER_SUPERVISOR_IMAGE_REPO: &str = "ghcr.io/nvidia/openshell/supervisor"; + +/// Path to the supervisor binary inside the `openshell/supervisor` image. +const SUPERVISOR_IMAGE_BINARY_PATH: &str = "/usr/local/bin/openshell-sandbox"; + +/// Return the default `ghcr.io/nvidia/openshell/supervisor:` reference +/// used when no supervisor binary override is provided. +pub fn default_docker_supervisor_image() -> String { + format!( + "{DEFAULT_DOCKER_SUPERVISOR_IMAGE_REPO}:{}", + default_docker_supervisor_image_tag() + ) +} + +/// Image tag baked in at compile time to pair the gateway with a matching +/// supervisor image. +/// +/// Build pipelines pass `OPENSHELL_IMAGE_TAG` explicitly. The `IMAGE_TAG` +/// fallback covers image build wrappers that already tag the gateway and +/// supervisor together. Standalone release binaries also patch the Cargo +/// package version, so use it when it has been set to a real release value. +fn default_docker_supervisor_image_tag() -> &'static str { + resolve_default_docker_supervisor_image_tag( + option_env!("OPENSHELL_IMAGE_TAG"), + option_env!("IMAGE_TAG"), + env!("CARGO_PKG_VERSION"), + ) +} + +fn resolve_default_docker_supervisor_image_tag( + openshell_image_tag: Option<&'static str>, + image_tag: Option<&'static str>, + cargo_pkg_version: &'static str, +) -> &'static str { + openshell_image_tag + .filter(|tag| !tag.is_empty()) + .or_else(|| image_tag.filter(|tag| !tag.is_empty())) + .unwrap_or_else(|| { + if cargo_pkg_version.is_empty() || cargo_pkg_version == "0.0.0" { + "dev" + } else { + cargo_pkg_version + } + }) +} + +/// Queried by the Docker driver to decide when a sandbox's supervisor +/// relay is live. Implementations return `true` once a sandbox has an +/// active `ConnectSupervisor` session registered. +/// +/// The driver cannot observe the supervisor's SSH socket directly (it +/// lives inside the container), so it leans on this signal to flip the +/// Ready condition from `DependenciesNotReady` to `True`. +pub trait SupervisorReadiness: Send + Sync + 'static { + fn is_supervisor_connected(&self, sandbox_id: &str) -> bool; +} + +/// Gateway-local configuration for the Docker compute driver. +#[derive(Debug, Clone, Default)] +pub struct DockerComputeConfig { + /// Optional override for the Linux `openshell-sandbox` binary mounted into containers. + pub supervisor_bin: Option, + + /// Optional override for the image the gateway pulls to extract the + /// Linux `openshell-sandbox` binary when no explicit binary path or + /// local build is available. Defaults to + /// `ghcr.io/nvidia/openshell/supervisor:`. + pub supervisor_image: Option, + + /// Host-side CA certificate for Docker sandbox mTLS. + pub guest_tls_ca: Option, + + /// Host-side client certificate for Docker sandbox mTLS. + pub guest_tls_cert: Option, + + /// Host-side private key for Docker sandbox mTLS. + pub guest_tls_key: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct DockerGuestTlsPaths { + pub(crate) ca: PathBuf, + pub(crate) cert: PathBuf, + pub(crate) key: PathBuf, +} + +#[derive(Debug, Clone)] +struct DockerDriverRuntimeConfig { + default_image: String, + image_pull_policy: String, + sandbox_namespace: String, + grpc_endpoint: String, + ssh_socket_path: String, + stop_timeout_secs: u32, + log_level: String, + supervisor_bin: PathBuf, + guest_tls: Option, + daemon_version: String, +} + +#[derive(Clone)] +pub struct DockerComputeDriver { + docker: Arc, + config: DockerDriverRuntimeConfig, + events: broadcast::Sender, + supervisor_readiness: Arc, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +struct DockerResourceLimits { + nano_cpus: Option, + memory_bytes: Option, +} + +type WatchStream = + Pin> + Send + 'static>>; + +impl DockerComputeDriver { + pub async fn new( + config: &Config, + docker_config: &DockerComputeConfig, + supervisor_readiness: Arc, + ) -> CoreResult { + if config.grpc_endpoint.trim().is_empty() { + return Err(Error::config( + "grpc_endpoint is required when using the docker compute driver", + )); + } + + let docker = Docker::connect_with_local_defaults() + .map_err(|err| Error::execution(format!("failed to create Docker client: {err}")))?; + let version = docker.version().await.map_err(|err| { + Error::execution(format!("failed to query Docker daemon version: {err}")) + })?; + let daemon_arch = normalize_docker_arch(version.arch.as_deref().unwrap_or_default()); + let supervisor_bin = resolve_supervisor_bin(&docker, docker_config, &daemon_arch).await?; + let guest_tls = docker_guest_tls_paths(config, docker_config)?; + + let driver = Self { + docker: Arc::new(docker), + config: DockerDriverRuntimeConfig { + default_image: config.sandbox_image.clone(), + image_pull_policy: config.sandbox_image_pull_policy.clone(), + sandbox_namespace: config.sandbox_namespace.clone(), + grpc_endpoint: config.grpc_endpoint.clone(), + ssh_socket_path: config.sandbox_ssh_socket_path.clone(), + stop_timeout_secs: DEFAULT_STOP_TIMEOUT_SECS, + log_level: config.log_level.clone(), + supervisor_bin, + guest_tls, + daemon_version: version.version.unwrap_or_else(|| "unknown".to_string()), + }, + events: broadcast::channel(WATCH_BUFFER).0, + supervisor_readiness, + }; + + let poll_driver = driver.clone(); + tokio::spawn(async move { + poll_driver.poll_loop().await; + }); + + Ok(driver) + } + + fn capabilities(&self) -> GetCapabilitiesResponse { + GetCapabilitiesResponse { + driver_name: "docker".to_string(), + driver_version: self.config.daemon_version.clone(), + default_image: self.config.default_image.clone(), + supports_gpu: false, + } + } + + fn validate_sandbox(sandbox: &DriverSandbox) -> Result<(), Status> { + let spec = sandbox + .spec + .as_ref() + .ok_or_else(|| Status::invalid_argument("sandbox.spec is required"))?; + let template = spec + .template + .as_ref() + .ok_or_else(|| Status::invalid_argument("sandbox.spec.template is required"))?; + + if template.image.trim().is_empty() { + return Err(Status::failed_precondition( + "docker sandboxes require a template image", + )); + } + if spec.gpu { + return Err(Status::failed_precondition( + "docker compute driver does not support gpu sandboxes", + )); + } + if !template.agent_socket_path.trim().is_empty() { + return Err(Status::failed_precondition( + "docker compute driver does not support template.agent_socket_path", + )); + } + if template + .platform_config + .as_ref() + .is_some_and(|config| !config.fields.is_empty()) + { + return Err(Status::failed_precondition( + "docker compute driver does not support template.platform_config", + )); + } + + let _ = docker_resource_limits(template)?; + Ok(()) + } + + async fn get_sandbox_snapshot( + &self, + sandbox_id: &str, + sandbox_name: &str, + ) -> Result, Status> { + let container = self + .find_managed_container_summary(sandbox_id, sandbox_name) + .await?; + Ok(container.and_then(|summary| { + sandbox_from_container_summary(&summary, self.supervisor_readiness.as_ref()) + })) + } + + async fn current_snapshots(&self) -> Result, Status> { + let containers = self.list_managed_container_summaries().await?; + let mut sandboxes = containers + .iter() + .filter_map(|summary| { + sandbox_from_container_summary(summary, self.supervisor_readiness.as_ref()) + }) + .collect::>(); + sandboxes.sort_by(|left, right| left.id.cmp(&right.id)); + Ok(sandboxes) + } + + async fn create_sandbox_inner(&self, sandbox: &DriverSandbox) -> Result<(), Status> { + Self::validate_sandbox(sandbox)?; + + if self + .find_managed_container_summary(&sandbox.id, &sandbox.name) + .await? + .is_some() + { + return Err(Status::already_exists("sandbox already exists")); + } + + let template = sandbox + .spec + .as_ref() + .and_then(|spec| spec.template.as_ref()) + .expect("validated sandbox has template"); + self.ensure_image_available(&template.image).await?; + + let container_name = container_name_for_sandbox(sandbox); + let create_body = build_container_create_body(sandbox, &self.config)?; + self.docker + .create_container( + Some( + CreateContainerOptionsBuilder::default() + .name(container_name.as_str()) + .build(), + ), + create_body, + ) + .await + .map_err(|err| { + create_status_from_docker_error("create docker sandbox container", err) + })?; + + if let Err(err) = self.docker.start_container(&container_name, None).await { + let cleanup = self + .docker + .remove_container( + &container_name, + Some(RemoveContainerOptionsBuilder::default().force(true).build()), + ) + .await; + if let Err(cleanup_err) = cleanup { + warn!( + sandbox_id = %sandbox.id, + container_name, + error = %cleanup_err, + "Failed to clean up Docker container after start failure" + ); + } + return Err(create_status_from_docker_error( + "start docker sandbox container", + err, + )); + } + + Ok(()) + } + + async fn delete_sandbox_inner( + &self, + sandbox_id: &str, + sandbox_name: &str, + ) -> Result { + let Some(container) = self + .find_managed_container_summary(sandbox_id, sandbox_name) + .await? + else { + return Ok(false); + }; + let Some(target) = summary_container_target(&container) else { + return Ok(false); + }; + + match self + .docker + .remove_container( + &target, + Some(RemoveContainerOptionsBuilder::default().force(true).build()), + ) + .await + { + Ok(()) => Ok(true), + Err(err) if is_not_found_error(&err) => Ok(false), + Err(err) => Err(internal_status("delete docker sandbox container", err)), + } + } + + async fn stop_sandbox_inner(&self, sandbox_id: &str, sandbox_name: &str) -> Result<(), Status> { + let Some(container) = self + .find_managed_container_summary(sandbox_id, sandbox_name) + .await? + else { + return Err(Status::not_found("sandbox not found")); + }; + let Some(target) = summary_container_target(&container) else { + return Err(Status::not_found("sandbox container has no id or name")); + }; + + match self + .docker + .stop_container( + &target, + Some( + StopContainerOptionsBuilder::default() + .t(docker_stop_timeout_secs(self.config.stop_timeout_secs)) + .build(), + ), + ) + .await + { + Ok(()) => Ok(()), + Err(err) if is_not_modified_error(&err) => Ok(()), + Err(err) if is_not_found_error(&err) => Err(Status::not_found("sandbox not found")), + Err(err) => Err(internal_status("stop docker sandbox container", err)), + } + } + + /// Start a managed sandbox container that was previously stopped. Used + /// by the gateway to resume sandboxes after a restart so that running + /// state in the gateway store is matched by an actually-running + /// container. + /// + /// Returns `Ok(true)` when a container existed and was started (or was + /// already running), `Ok(false)` when no managed container is found for + /// the sandbox, and `Err(...)` for any Docker failure. + pub async fn resume_sandbox( + &self, + sandbox_id: &str, + sandbox_name: &str, + ) -> Result { + let Some(container) = self + .find_managed_container_summary(sandbox_id, sandbox_name) + .await? + else { + return Ok(false); + }; + let Some(target) = summary_container_target(&container) else { + return Ok(false); + }; + let state = container.state.unwrap_or(ContainerSummaryStateEnum::EMPTY); + if !container_state_needs_resume(state) { + return Ok(true); + } + + match self.docker.start_container(&target, None).await { + Ok(()) => Ok(true), + // Already running — race with another resume path or the + // restart policy. Treat as success. + Err(err) if is_not_modified_error(&err) => Ok(true), + Err(err) if is_not_found_error(&err) => Ok(false), + Err(err) => Err(internal_status("start docker sandbox container", err)), + } + } + + pub async fn stop_managed_containers_on_shutdown(&self) -> Result { + let containers = self.list_managed_container_summaries().await?; + let targets = containers + .into_iter() + .filter_map(|container| { + let state = container.state.unwrap_or(ContainerSummaryStateEnum::EMPTY); + if container_state_needs_shutdown_stop(state) { + summary_container_target(&container) + } else { + None + } + }) + .collect::>(); + let target_count = targets.len(); + let mut stopped = 0usize; + let mut failures = Vec::new(); + let stop_timeout_secs = self.config.stop_timeout_secs; + + let mut stop_results = futures::stream::iter(targets.into_iter().map(|target| { + let docker = self.docker.clone(); + async move { + let result = docker + .stop_container( + &target, + Some( + StopContainerOptionsBuilder::default() + .t(docker_stop_timeout_secs(stop_timeout_secs)) + .build(), + ), + ) + .await; + (target, result) + } + })) + .buffer_unordered(16); + + while let Some((target, result)) = stop_results.next().await { + match result { + Ok(()) => { + stopped += 1; + } + Err(err) if is_not_found_error(&err) || is_not_modified_error(&err) => {} + Err(err) => { + warn!( + container = %target, + error = %err, + "Failed to stop Docker sandbox container during shutdown" + ); + failures.push(target); + } + } + } + + if !failures.is_empty() { + return Err(Status::internal(format!( + "failed to stop {} of {target_count} Docker sandbox containers during shutdown", + failures.len() + ))); + } + + Ok(stopped) + } + + async fn poll_loop(self) { + let mut previous = match self.current_snapshot_map().await { + Ok(snapshots) => snapshots, + Err(err) => { + warn!(error = %err, "Failed to seed Docker sandbox watch state"); + HashMap::new() + } + }; + + // Exponential backoff on consecutive Docker failures to avoid a 2s + // warn-log flood when the daemon is unreachable for an extended + // period (e.g. restart, socket removed). + let mut backoff = WATCH_POLL_INTERVAL; + loop { + tokio::time::sleep(backoff).await; + match self.current_snapshot_map().await { + Ok(current) => { + emit_snapshot_diff(&self.events, &previous, ¤t); + previous = current; + backoff = WATCH_POLL_INTERVAL; + } + Err(err) => { + warn!( + error = %err, + backoff_secs = backoff.as_secs(), + "Failed to poll Docker sandboxes" + ); + backoff = (backoff * 2).min(WATCH_POLL_MAX_BACKOFF); + } + } + } + } + + async fn current_snapshot_map(&self) -> Result, Status> { + self.current_snapshots().await.map(|snapshots| { + snapshots + .into_iter() + .map(|sandbox| (sandbox.id.clone(), sandbox)) + .collect() + }) + } + + async fn list_managed_container_summaries(&self) -> Result, Status> { + let filters = managed_container_label_filters(&self.config.sandbox_namespace, []); + self.docker + .list_containers(Some( + ListContainersOptionsBuilder::default() + .all(true) + .filters(&filters) + .build(), + )) + .await + .map_err(|err| internal_status("list Docker sandbox containers", err)) + } + + async fn find_managed_container_summary( + &self, + sandbox_id: &str, + sandbox_name: &str, + ) -> Result, Status> { + let mut label_filter_values = Vec::new(); + if !sandbox_id.is_empty() { + label_filter_values.push(format!("{SANDBOX_ID_LABEL_KEY}={sandbox_id}")); + } else if !sandbox_name.is_empty() { + label_filter_values.push(format!("{SANDBOX_NAME_LABEL_KEY}={sandbox_name}")); + } + + let filters = + managed_container_label_filters(&self.config.sandbox_namespace, label_filter_values); + let containers = self + .docker + .list_containers(Some( + ListContainersOptionsBuilder::default() + .all(true) + .filters(&filters) + .build(), + )) + .await + .map_err(|err| internal_status("find Docker sandbox container", err))?; + + Ok(containers.into_iter().find(|summary| { + let Some(labels) = summary.labels.as_ref() else { + return false; + }; + let namespace_matches = labels + .get(SANDBOX_NAMESPACE_LABEL_KEY) + .is_some_and(|value| value == &self.config.sandbox_namespace); + let id_matches = sandbox_id.is_empty() + || labels + .get(SANDBOX_ID_LABEL_KEY) + .is_some_and(|value| value == sandbox_id); + let name_matches = sandbox_name.is_empty() + || labels + .get(SANDBOX_NAME_LABEL_KEY) + .is_some_and(|value| value == sandbox_name); + namespace_matches && id_matches && name_matches + })) + } + + async fn ensure_image_available(&self, image: &str) -> Result<(), Status> { + let policy = self.config.image_pull_policy.trim().to_ascii_lowercase(); + match policy.as_str() { + "" | "ifnotpresent" => { + if self.docker.inspect_image(image).await.is_ok() { + return Ok(()); + } + self.pull_image(image).await + } + "always" => self.pull_image(image).await, + "never" => match self.docker.inspect_image(image).await { + Ok(_) => Ok(()), + Err(err) if is_not_found_error(&err) => Err(Status::failed_precondition(format!( + "docker image '{image}' is not present locally and sandbox_image_pull_policy=Never" + ))), + Err(err) => Err(internal_status("inspect Docker image", err)), + }, + other => Err(Status::failed_precondition(format!( + "unsupported docker sandbox_image_pull_policy '{other}'; expected Always, IfNotPresent, or Never", + ))), + } + } + + async fn pull_image(&self, image: &str) -> Result<(), Status> { + let mut stream = self.docker.create_image( + Some(CreateImageOptions { + from_image: Some(image.to_string()), + ..Default::default() + }), + None, + None, + ); + while let Some(result) = stream.next().await { + result.map_err(|err| internal_status("pull Docker image", err))?; + } + Ok(()) + } +} + +#[tonic::async_trait] +impl ComputeDriver for DockerComputeDriver { + type WatchSandboxesStream = WatchStream; + + async fn get_capabilities( + &self, + _request: Request, + ) -> Result, Status> { + Ok(Response::new(self.capabilities())) + } + + async fn validate_sandbox_create( + &self, + request: Request, + ) -> Result, Status> { + let sandbox = request + .into_inner() + .sandbox + .ok_or_else(|| Status::invalid_argument("sandbox is required"))?; + Self::validate_sandbox(&sandbox)?; + Ok(Response::new(ValidateSandboxCreateResponse {})) + } + + async fn get_sandbox( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + if request.sandbox_id.is_empty() && request.sandbox_name.is_empty() { + return Err(Status::invalid_argument( + "sandbox_id or sandbox_name is required", + )); + } + + let sandbox = self + .get_sandbox_snapshot(&request.sandbox_id, &request.sandbox_name) + .await? + .ok_or_else(|| Status::not_found("sandbox not found"))?; + + if !request.sandbox_id.is_empty() && request.sandbox_id != sandbox.id { + return Err(Status::failed_precondition( + "sandbox_id did not match the fetched sandbox", + )); + } + + Ok(Response::new(GetSandboxResponse { + sandbox: Some(sandbox), + })) + } + + async fn list_sandboxes( + &self, + _request: Request, + ) -> Result, Status> { + Ok(Response::new(ListSandboxesResponse { + sandboxes: self.current_snapshots().await?, + })) + } + + async fn create_sandbox( + &self, + request: Request, + ) -> Result, Status> { + let sandbox = request + .into_inner() + .sandbox + .ok_or_else(|| Status::invalid_argument("sandbox is required"))?; + self.create_sandbox_inner(&sandbox).await?; + Ok(Response::new(CreateSandboxResponse {})) + } + + async fn stop_sandbox( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + if request.sandbox_id.is_empty() && request.sandbox_name.is_empty() { + return Err(Status::invalid_argument( + "sandbox_id or sandbox_name is required", + )); + } + + self.stop_sandbox_inner(&request.sandbox_id, &request.sandbox_name) + .await?; + Ok(Response::new(StopSandboxResponse {})) + } + + async fn delete_sandbox( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + Ok(Response::new(DeleteSandboxResponse { + deleted: self + .delete_sandbox_inner(&request.sandbox_id, &request.sandbox_name) + .await?, + })) + } + + async fn watch_sandboxes( + &self, + _request: Request, + ) -> Result, Status> { + // Subscribe before taking the initial snapshot so any event emitted + // between the snapshot and this subscriber becoming active is still + // delivered. Downstream consumers treat sandbox events as + // idempotent (keyed by sandbox id), so a duplicate event is benign + // while a missed one leaks state. + let mut rx = self.events.subscribe(); + let initial = self.current_snapshots().await?; + let (tx, out_rx) = mpsc::channel(WATCH_BUFFER); + tokio::spawn(async move { + for sandbox in initial { + if tx + .send(Ok(WatchSandboxesEvent { + payload: Some(watch_sandboxes_event::Payload::Sandbox( + WatchSandboxesSandboxEvent { + sandbox: Some(sandbox), + }, + )), + })) + .await + .is_err() + { + return; + } + } + + loop { + match rx.recv().await { + Ok(event) => { + if tx.send(Ok(event)).await.is_err() { + return; + } + } + Err(broadcast::error::RecvError::Lagged(_)) => {} + Err(broadcast::error::RecvError::Closed) => return, + } + } + }); + + Ok(Response::new(Box::pin(ReceiverStream::new(out_rx)))) + } +} + +fn build_mounts(config: &DockerDriverRuntimeConfig) -> Vec { + let mut mounts = vec![bind_mount( + &config.supervisor_bin, + SUPERVISOR_MOUNT_PATH, + true, + )]; + if let Some(tls) = &config.guest_tls { + mounts.push(bind_mount(&tls.ca, TLS_CA_MOUNT_PATH, true)); + mounts.push(bind_mount(&tls.cert, TLS_CERT_MOUNT_PATH, true)); + mounts.push(bind_mount(&tls.key, TLS_KEY_MOUNT_PATH, true)); + } + mounts +} + +fn bind_mount(source: &Path, target: &str, read_only: bool) -> Mount { + Mount { + target: Some(target.to_string()), + source: Some(source.display().to_string()), + typ: Some(MountTypeEnum::BIND), + read_only: Some(read_only), + ..Default::default() + } +} + +fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig) -> Vec { + let mut environment = HashMap::from([ + ("HOME".to_string(), "/root".to_string()), + ( + "PATH".to_string(), + "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(), + ), + ("TERM".to_string(), "xterm".to_string()), + ( + "OPENSHELL_LOG_LEVEL".to_string(), + sandbox_log_level(sandbox, &config.log_level), + ), + ]); + + if let Some(spec) = sandbox.spec.as_ref() { + if let Some(template) = spec.template.as_ref() { + environment.extend(template.environment.clone()); + } + environment.extend(spec.environment.clone()); + } + + environment.insert( + "OPENSHELL_ENDPOINT".to_string(), + container_visible_openshell_endpoint(&config.grpc_endpoint), + ); + environment.insert("OPENSHELL_SANDBOX_ID".to_string(), sandbox.id.clone()); + environment.insert("OPENSHELL_SANDBOX".to_string(), sandbox.name.clone()); + environment.insert( + "OPENSHELL_SSH_SOCKET_PATH".to_string(), + config.ssh_socket_path.clone(), + ); + environment.insert( + "OPENSHELL_SANDBOX_COMMAND".to_string(), + SANDBOX_COMMAND.to_string(), + ); + if config.guest_tls.is_some() { + environment.insert( + "OPENSHELL_TLS_CA".to_string(), + TLS_CA_MOUNT_PATH.to_string(), + ); + environment.insert( + "OPENSHELL_TLS_CERT".to_string(), + TLS_CERT_MOUNT_PATH.to_string(), + ); + environment.insert( + "OPENSHELL_TLS_KEY".to_string(), + TLS_KEY_MOUNT_PATH.to_string(), + ); + } + + let mut pairs = environment.into_iter().collect::>(); + pairs.sort_by(|left, right| left.0.cmp(&right.0)); + pairs + .into_iter() + .map(|(key, value)| format!("{key}={value}")) + .collect() +} + +fn build_container_create_body( + sandbox: &DriverSandbox, + config: &DockerDriverRuntimeConfig, +) -> Result { + let spec = sandbox + .spec + .as_ref() + .ok_or_else(|| Status::invalid_argument("sandbox.spec is required"))?; + let template = spec + .template + .as_ref() + .ok_or_else(|| Status::invalid_argument("sandbox.spec.template is required"))?; + let resource_limits = docker_resource_limits(template)?; + let mut labels = template.labels.clone(); + labels.insert( + MANAGED_BY_LABEL_KEY.to_string(), + MANAGED_BY_LABEL_VALUE.to_string(), + ); + labels.insert(SANDBOX_ID_LABEL_KEY.to_string(), sandbox.id.clone()); + labels.insert(SANDBOX_NAME_LABEL_KEY.to_string(), sandbox.name.clone()); + labels.insert( + SANDBOX_NAMESPACE_LABEL_KEY.to_string(), + sandbox.namespace.clone(), + ); + + Ok(ContainerCreateBody { + image: Some(template.image.clone()), + user: Some("0".to_string()), + env: Some(build_environment(sandbox, config)), + entrypoint: Some(vec![SUPERVISOR_MOUNT_PATH.to_string()]), + // Clear the image CMD so Docker does not append inherited args to the + // supervisor entrypoint. + cmd: Some(Vec::new()), + labels: Some(labels), + host_config: Some(HostConfig { + nano_cpus: resource_limits.nano_cpus, + memory: resource_limits.memory_bytes, + mounts: Some(build_mounts(config)), + restart_policy: Some(RestartPolicy { + name: Some(RestartPolicyNameEnum::UNLESS_STOPPED), + maximum_retry_count: None, + }), + cap_add: Some(vec![ + "SYS_ADMIN".to_string(), + "NET_ADMIN".to_string(), + "SYS_PTRACE".to_string(), + "SYSLOG".to_string(), + ]), + extra_hosts: Some(vec![ + format!("{HOST_DOCKER_INTERNAL}:host-gateway"), + format!("{HOST_OPENSHELL_INTERNAL}:host-gateway"), + ]), + ..Default::default() + }), + ..Default::default() + }) +} + +fn sandbox_log_level(sandbox: &DriverSandbox, default_level: &str) -> String { + sandbox + .spec + .as_ref() + .map(|spec| spec.log_level.as_str()) + .filter(|level| !level.is_empty()) + .unwrap_or(default_level) + .to_string() +} + +fn container_visible_openshell_endpoint(endpoint: &str) -> String { + let Ok(mut url) = Url::parse(endpoint) else { + return endpoint.to_string(); + }; + + let should_rewrite = match url.host() { + Some(Host::Ipv4(ip)) => ip.is_loopback() || ip.is_unspecified(), + Some(Host::Ipv6(ip)) => ip.is_loopback() || ip.is_unspecified(), + Some(Host::Domain(host)) => host.eq_ignore_ascii_case("localhost"), + None => false, + }; + + if should_rewrite && url.set_host(Some(HOST_OPENSHELL_INTERNAL)).is_ok() { + return url.to_string(); + } + + endpoint.to_string() +} + +fn docker_resource_limits( + template: &DriverSandboxTemplate, +) -> Result { + let Some(resources) = template.resources.as_ref() else { + return Ok(DockerResourceLimits::default()); + }; + + if !resources.cpu_request.trim().is_empty() { + return Err(Status::failed_precondition( + "docker compute driver does not support resources.requests.cpu", + )); + } + if !resources.memory_request.trim().is_empty() { + return Err(Status::failed_precondition( + "docker compute driver does not support resources.requests.memory", + )); + } + + Ok(DockerResourceLimits { + nano_cpus: parse_cpu_limit(&resources.cpu_limit)?, + memory_bytes: parse_memory_limit(&resources.memory_limit)?, + }) +} + +#[allow(clippy::cast_possible_truncation)] +fn parse_cpu_limit(value: &str) -> Result, Status> { + let value = value.trim(); + if value.is_empty() { + return Ok(None); + } + if let Some(millicores) = value.strip_suffix('m') { + let millicores = millicores.parse::().map_err(|_| { + Status::failed_precondition(format!( + "invalid docker cpu_limit '{value}'; expected an integer or millicore quantity", + )) + })?; + if millicores <= 0 { + return Err(Status::failed_precondition( + "docker cpu_limit must be greater than zero", + )); + } + return Ok(Some(millicores.saturating_mul(1_000_000))); + } + + let cores = value.parse::().map_err(|_| { + Status::failed_precondition(format!( + "invalid docker cpu_limit '{value}'; expected an integer or millicore quantity", + )) + })?; + if !cores.is_finite() || cores <= 0.0 { + return Err(Status::failed_precondition( + "docker cpu_limit must be greater than zero", + )); + } + + Ok(Some((cores * 1_000_000_000.0).round() as i64)) +} + +#[allow(clippy::cast_possible_truncation)] +fn parse_memory_limit(value: &str) -> Result, Status> { + let value = value.trim(); + if value.is_empty() { + return Ok(None); + } + + let number_end = value + .find(|ch: char| !(ch.is_ascii_digit() || ch == '.')) + .unwrap_or(value.len()); + let (number, suffix) = value.split_at(number_end); + let amount = number.parse::().map_err(|_| { + Status::failed_precondition(format!( + "invalid docker memory_limit '{value}'; expected a Kubernetes-style quantity", + )) + })?; + if !amount.is_finite() || amount <= 0.0 { + return Err(Status::failed_precondition( + "docker memory_limit must be greater than zero", + )); + } + + let multiplier = match suffix { + "" => 1_f64, + "Ki" => 1024_f64, + "Mi" => 1024_f64.powi(2), + "Gi" => 1024_f64.powi(3), + "Ti" => 1024_f64.powi(4), + "Pi" => 1024_f64.powi(5), + "Ei" => 1024_f64.powi(6), + "K" => 1000_f64, + "M" => 1000_f64.powi(2), + "G" => 1000_f64.powi(3), + "T" => 1000_f64.powi(4), + "P" => 1000_f64.powi(5), + "E" => 1000_f64.powi(6), + _ => { + return Err(Status::failed_precondition(format!( + "invalid docker memory_limit suffix '{suffix}'", + ))); + } + }; + + Ok(Some((amount * multiplier).round() as i64)) +} + +fn sandbox_from_container_summary( + summary: &ContainerSummary, + readiness: &dyn SupervisorReadiness, +) -> Option { + let labels = summary.labels.as_ref()?; + let id = labels.get(SANDBOX_ID_LABEL_KEY)?.clone(); + let name = labels.get(SANDBOX_NAME_LABEL_KEY)?.clone(); + let namespace = labels + .get(SANDBOX_NAMESPACE_LABEL_KEY) + .cloned() + .unwrap_or_default(); + + let supervisor_connected = readiness.is_supervisor_connected(&id); + Some(DriverSandbox { + id, + name: name.clone(), + namespace, + spec: None, + status: Some(driver_status_from_summary( + summary, + &name, + supervisor_connected, + )), + }) +} + +fn driver_status_from_summary( + summary: &ContainerSummary, + sandbox_name: &str, + supervisor_connected: bool, +) -> DriverSandboxStatus { + let state = summary.state.unwrap_or(ContainerSummaryStateEnum::EMPTY); + let (ready, reason, message, deleting) = container_ready_condition(state, supervisor_connected); + + DriverSandboxStatus { + sandbox_name: summary_container_name(summary).unwrap_or_else(|| sandbox_name.to_string()), + instance_id: summary.id.clone().unwrap_or_default(), + agent_fd: String::new(), + sandbox_fd: String::new(), + conditions: vec![DriverCondition { + r#type: "Ready".to_string(), + status: ready.to_string(), + reason: reason.to_string(), + message: message.to_string(), + last_transition_time: String::new(), + }], + deleting, + } +} + +fn container_ready_condition( + state: ContainerSummaryStateEnum, + supervisor_connected: bool, +) -> (&'static str, &'static str, &'static str, bool) { + match state { + ContainerSummaryStateEnum::RUNNING => { + if supervisor_connected { + ( + "True", + "SupervisorConnected", + "Supervisor relay is live", + false, + ) + } else { + ( + "False", + "DependenciesNotReady", + "Container is running; waiting for supervisor relay", + false, + ) + } + } + ContainerSummaryStateEnum::CREATED => ("False", "Starting", "Container created", false), + ContainerSummaryStateEnum::RESTARTING => ( + "False", + "ContainerRestarting", + "Container is restarting after a failure", + false, + ), + ContainerSummaryStateEnum::EMPTY => { + ("False", "Starting", "Container state is unknown", false) + } + ContainerSummaryStateEnum::REMOVING => { + ("False", "Deleting", "Container is being removed", true) + } + ContainerSummaryStateEnum::PAUSED => { + ("False", "ContainerPaused", "Container is paused", false) + } + ContainerSummaryStateEnum::EXITED => { + ("False", "ContainerExited", "Container exited", false) + } + ContainerSummaryStateEnum::DEAD => ("False", "ContainerDead", "Container is dead", false), + } +} + +fn summary_container_name(summary: &ContainerSummary) -> Option { + summary + .names + .as_ref() + .and_then(|names| names.first()) + .map(|name| name.trim_start_matches('/').to_string()) + .filter(|name| !name.is_empty()) +} + +fn summary_container_target(summary: &ContainerSummary) -> Option { + // Prefer the container ID: it's stable while the container exists and is + // accepted by Docker APIs just like a name. Fall back to the parsed name + // for transient summaries that do not include an ID. + summary + .id + .as_deref() + .filter(|id| !id.is_empty()) + .map(str::to_string) + .or_else(|| summary_container_name(summary)) +} + +fn container_state_needs_shutdown_stop(state: ContainerSummaryStateEnum) -> bool { + matches!( + state, + ContainerSummaryStateEnum::RUNNING + | ContainerSummaryStateEnum::RESTARTING + | ContainerSummaryStateEnum::PAUSED + ) +} + +/// States from which a managed container can be brought back to running by +/// `start_container`. Skip `Restarting` (already coming up), `Removing`, +/// `Dead` (terminal), `Paused` (needs `unpause`, not `start`), and +/// `Running` (nothing to do). +fn container_state_needs_resume(state: ContainerSummaryStateEnum) -> bool { + matches!( + state, + ContainerSummaryStateEnum::EXITED | ContainerSummaryStateEnum::CREATED + ) +} + +fn docker_stop_timeout_secs(timeout_secs: u32) -> i32 { + i32::try_from(timeout_secs).unwrap_or(i32::MAX) +} + +fn emit_snapshot_diff( + events: &broadcast::Sender, + previous: &HashMap, + current: &HashMap, +) { + for (sandbox_id, sandbox) in current { + if previous.get(sandbox_id) == Some(sandbox) { + continue; + } + let _ = events.send(WatchSandboxesEvent { + payload: Some(watch_sandboxes_event::Payload::Sandbox( + WatchSandboxesSandboxEvent { + sandbox: Some(sandbox.clone()), + }, + )), + }); + } + + for sandbox_id in previous.keys() { + if current.contains_key(sandbox_id) { + continue; + } + let _ = events.send(WatchSandboxesEvent { + payload: Some(watch_sandboxes_event::Payload::Deleted( + WatchSandboxesDeletedEvent { + sandbox_id: sandbox_id.clone(), + }, + )), + }); + } +} + +fn label_filters(values: impl IntoIterator) -> HashMap> { + HashMap::from([("label".to_string(), values.into_iter().collect())]) +} + +fn managed_container_label_filters( + sandbox_namespace: &str, + extra_values: impl IntoIterator, +) -> HashMap> { + let mut values = vec![ + format!("{MANAGED_BY_LABEL_KEY}={MANAGED_BY_LABEL_VALUE}"), + format!("{SANDBOX_NAMESPACE_LABEL_KEY}={sandbox_namespace}"), + ]; + values.extend(extra_values); + label_filters(values) +} + +/// Maximum Docker container name length. Docker's own limit is 253 bytes, but +/// we cap at a conservative 200 to leave headroom for tooling that truncates +/// names further. +const MAX_CONTAINER_NAME_LEN: usize = 200; +const CONTAINER_NAME_PREFIX: &str = "openshell-"; + +fn container_name_for_sandbox(sandbox: &DriverSandbox) -> String { + let id_suffix = sanitize_docker_name(&sandbox.id); + let name = sanitize_docker_name(&sandbox.name); + if name.is_empty() { + let mut base = format!("{CONTAINER_NAME_PREFIX}{id_suffix}"); + // The prefix is always < MAX_CONTAINER_NAME_LEN. Truncate the id + // suffix only if the sandbox id itself is pathologically long. + if base.len() > MAX_CONTAINER_NAME_LEN { + base.truncate(MAX_CONTAINER_NAME_LEN); + } + return base; + } + + // Reserve space for the prefix and the `-` tail so the id + // suffix — which is what makes the name unique between sandboxes that + // share a human-readable prefix — is never truncated away. + let reserved = CONTAINER_NAME_PREFIX.len() + 1 + id_suffix.len(); + if reserved >= MAX_CONTAINER_NAME_LEN { + // Pathological sandbox id. Fall back to `` and truncate. + let mut base = format!("{CONTAINER_NAME_PREFIX}{id_suffix}"); + base.truncate(MAX_CONTAINER_NAME_LEN); + return trim_container_name_tail(base); + } + + let name_budget = MAX_CONTAINER_NAME_LEN - reserved; + let truncated_name = if name.len() > name_budget { + trim_container_name_tail(name[..name_budget].to_string()) + } else { + name + }; + format!("{CONTAINER_NAME_PREFIX}{truncated_name}-{id_suffix}") +} + +/// Docker container names may not end with `-`, `.`, or `_`. Truncation can +/// leave one of those trailing, so strip them before returning. +fn trim_container_name_tail(mut value: String) -> String { + while value + .chars() + .last() + .is_some_and(|ch| matches!(ch, '-' | '.' | '_')) + { + value.pop(); + } + value +} + +fn sanitize_docker_name(value: &str) -> String { + value + .chars() + .map(|ch| { + if ch.is_ascii_alphanumeric() || matches!(ch, '_' | '.' | '-') { + ch.to_ascii_lowercase() + } else { + '-' + } + }) + .collect::() + .trim_matches('-') + .to_string() +} + +fn normalize_docker_arch(arch: &str) -> String { + match arch { + "x86_64" => "amd64".to_string(), + "aarch64" => "arm64".to_string(), + other => other.to_ascii_lowercase(), + } +} + +pub(crate) async fn resolve_supervisor_bin( + docker: &Docker, + docker_config: &DockerComputeConfig, + daemon_arch: &str, +) -> CoreResult { + // Tier 1: explicit --docker-supervisor-bin / OPENSHELL_DOCKER_SUPERVISOR_BIN. + if let Some(path) = docker_config.supervisor_bin.clone() { + let path = canonicalize_existing_file(&path, "docker supervisor binary")?; + validate_linux_elf_binary(&path)?; + return Ok(path); + } + + // Tier 2: sibling `openshell-sandbox` next to the running gateway + // (release artifact layout). Linux-only because the sibling must be a + // Linux ELF to bind-mount into a Linux container. + if cfg!(target_os = "linux") { + let current_exe = std::env::current_exe() + .map_err(|err| Error::config(format!("failed to resolve current executable: {err}")))?; + if let Some(parent) = current_exe.parent() { + let sibling = parent.join("openshell-sandbox"); + if sibling.is_file() { + let path = canonicalize_existing_file(&sibling, "docker supervisor binary")?; + if validate_linux_elf_binary(&path).is_ok() { + return Ok(path); + } + } + } + } + + // Tier 3: local cargo target build (developer workflow). Preferred + // over a registry pull when available because it matches whatever the + // developer just built. + let target_candidates = linux_supervisor_candidates(daemon_arch); + for candidate in &target_candidates { + if candidate.is_file() { + let path = canonicalize_existing_file(candidate, "docker supervisor binary")?; + if validate_linux_elf_binary(&path).is_ok() { + return Ok(path); + } + } + } + + // Tier 4: pull the supervisor image from a registry and extract the + // binary to a host-side cache keyed by image content digest. This is + // the default path for released gateway binaries. + let image = docker_config + .supervisor_image + .clone() + .unwrap_or_else(default_docker_supervisor_image); + extract_supervisor_bin_from_image(docker, &image).await +} + +fn linux_supervisor_candidates(daemon_arch: &str) -> Vec { + match daemon_arch { + "arm64" => vec![PathBuf::from( + "target/aarch64-unknown-linux-gnu/release/openshell-sandbox", + )], + "amd64" => vec![PathBuf::from( + "target/x86_64-unknown-linux-gnu/release/openshell-sandbox", + )], + _ => Vec::new(), + } +} + +/// Pull the supervisor image (if not already local), extract +/// `/usr/local/bin/openshell-sandbox` to a host cache keyed by the image's +/// content digest, and return the cache path. +/// +/// The extraction is atomic: the binary is written to a sibling temp file +/// inside the digest-keyed directory and renamed into place, so concurrent +/// gateway starts don't observe a partial file. +async fn extract_supervisor_bin_from_image(docker: &Docker, image: &str) -> CoreResult { + // Inspect first to see if the image is already present; only pull on miss. + let inspect = match docker.inspect_image(image).await { + Ok(inspect) => inspect, + Err(err) if is_not_found_error(&err) => { + info!(image = image, "Pulling docker supervisor image"); + pull_supervisor_image(docker, image).await?; + docker.inspect_image(image).await.map_err(|err| { + Error::config(format!( + "failed to inspect docker supervisor image '{image}' after pull: {err}", + )) + })? + } + Err(err) => { + return Err(Error::config(format!( + "failed to inspect docker supervisor image '{image}': {err}", + ))); + } + }; + + let digest = inspect.id.clone().ok_or_else(|| { + Error::config(format!( + "docker supervisor image '{image}' inspect response has no Id", + )) + })?; + + let cache_path = supervisor_cache_path(&digest)?; + if cache_path.is_file() { + validate_linux_elf_binary(&cache_path)?; + return Ok(cache_path); + } + + let cache_dir = cache_path.parent().ok_or_else(|| { + Error::config(format!( + "docker supervisor cache path '{}' has no parent directory", + cache_path.display(), + )) + })?; + std::fs::create_dir_all(cache_dir).map_err(|err| { + Error::config(format!( + "failed to create docker supervisor cache dir '{}': {err}", + cache_dir.display(), + )) + })?; + + info!( + image = image, + digest = digest, + cache_path = %cache_path.display(), + "Extracting supervisor binary from image to host cache", + ); + + let binary_bytes = extract_supervisor_binary_bytes(docker, image).await?; + write_cache_binary_atomic(&cache_path, &binary_bytes)?; + validate_linux_elf_binary(&cache_path)?; + Ok(cache_path) +} + +async fn pull_supervisor_image(docker: &Docker, image: &str) -> CoreResult<()> { + let mut stream = docker.create_image( + Some(CreateImageOptions { + from_image: Some(image.to_string()), + ..Default::default() + }), + None, + None, + ); + while let Some(result) = stream.next().await { + result.map_err(|err| { + Error::config(format!( + "failed to pull docker supervisor image '{image}': {err}", + )) + })?; + } + Ok(()) +} + +/// Create a short-lived container from `image`, stream out the supervisor +/// binary as a tar archive, and return the untarred file bytes. The +/// container is always removed, even on error paths. +async fn extract_supervisor_binary_bytes(docker: &Docker, image: &str) -> CoreResult> { + let container_name = temp_extract_container_name(); + docker + .create_container( + Some( + CreateContainerOptionsBuilder::default() + .name(container_name.as_str()) + .build(), + ), + ContainerCreateBody { + image: Some(image.to_string()), + entrypoint: Some(vec!["/bin/true".to_string()]), + cmd: Some(Vec::new()), + ..Default::default() + }, + ) + .await + .map_err(|err| { + Error::config(format!( + "failed to create extractor container from '{image}': {err}", + )) + })?; + + // Always tear down the extractor container, even if extraction fails. + let result = download_binary_from_container(docker, &container_name).await; + if let Err(remove_err) = docker + .remove_container( + &container_name, + Some(RemoveContainerOptionsBuilder::default().force(true).build()), + ) + .await + { + warn!( + container = container_name, + error = %remove_err, + "Failed to remove supervisor extractor container", + ); + } + result +} + +async fn download_binary_from_container( + docker: &Docker, + container_name: &str, +) -> CoreResult> { + let options = DownloadFromContainerOptionsBuilder::default() + .path(SUPERVISOR_IMAGE_BINARY_PATH) + .build(); + let mut stream = docker.download_from_container(container_name, Some(options)); + + let mut tar_bytes = Vec::new(); + while let Some(chunk) = stream.next().await { + let chunk: Bytes = chunk.map_err(|err| { + Error::config(format!( + "failed to read supervisor binary stream from '{container_name}': {err}", + )) + })?; + tar_bytes.extend_from_slice(&chunk); + } + + extract_first_tar_entry(&tar_bytes).map_err(|err| { + Error::config(format!( + "failed to extract supervisor binary from tar archive returned by '{container_name}': {err}", + )) + }) +} + +/// Extract the payload of the first regular-file entry in a tar archive. +/// Docker's `/containers//archive` endpoint returns a single-file tar +/// when `path` points to a file, so we only need the first entry. +fn extract_first_tar_entry(tar_bytes: &[u8]) -> Result, String> { + let mut archive = tar::Archive::new(std::io::Cursor::new(tar_bytes)); + let mut entries = archive + .entries() + .map_err(|err| format!("open tar archive: {err}"))?; + let mut entry = entries + .next() + .ok_or_else(|| "tar archive was empty".to_string())? + .map_err(|err| format!("read tar entry: {err}"))?; + let mut bytes = Vec::new(); + entry + .read_to_end(&mut bytes) + .map_err(|err| format!("read tar entry payload: {err}"))?; + Ok(bytes) +} + +fn write_cache_binary_atomic(final_path: &Path, bytes: &[u8]) -> CoreResult<()> { + let dir = final_path.parent().ok_or_else(|| { + Error::config(format!( + "docker supervisor cache path '{}' has no parent directory", + final_path.display(), + )) + })?; + let mut temp = tempfile::Builder::new() + .prefix(".openshell-sandbox-") + .tempfile_in(dir) + .map_err(|err| { + Error::config(format!( + "failed to create temp file for supervisor binary in '{}': {err}", + dir.display(), + )) + })?; + std::io::Write::write_all(&mut temp, bytes).map_err(|err| { + Error::config(format!( + "failed to write supervisor binary to temp file: {err}", + )) + })?; + temp.as_file().sync_all().map_err(|err| { + Error::config(format!("failed to sync supervisor binary temp file: {err}",)) + })?; + + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(temp.path(), std::fs::Permissions::from_mode(0o755)).map_err( + |err| { + Error::config(format!( + "failed to chmod supervisor binary temp file: {err}", + )) + }, + )?; + } + + temp.persist(final_path).map_err(|err| { + Error::config(format!( + "failed to rename supervisor binary into '{}': {}", + final_path.display(), + err.error, + )) + })?; + Ok(()) +} + +/// Cache path for an extracted supervisor binary, keyed by the image's +/// content-addressable digest (e.g. `sha256:abc123…`). The digest-prefixed +/// directory keeps stale extractions from earlier releases isolated so they +/// can be GC'd without affecting the active binary. +fn supervisor_cache_path(digest: &str) -> CoreResult { + let base = openshell_core::paths::xdg_data_dir() + .map_err(|err| Error::config(format!("failed to resolve XDG data dir: {err}")))?; + Ok(supervisor_cache_path_with_base(&base, digest)) +} + +fn supervisor_cache_path_with_base(base: &Path, digest: &str) -> PathBuf { + let sanitized = digest.replace(':', "-"); + base.join("openshell") + .join("docker-supervisor") + .join(sanitized) + .join("openshell-sandbox") +} + +fn temp_extract_container_name() -> String { + use std::sync::atomic::{AtomicU64, Ordering}; + static SEQ: AtomicU64 = AtomicU64::new(0); + let pid = std::process::id(); + let seq = SEQ.fetch_add(1, Ordering::Relaxed); + format!("openshell-supervisor-extract-{pid}-{seq}") +} + +fn canonicalize_existing_file(path: &Path, description: &str) -> CoreResult { + if !path.is_file() { + return Err(Error::config(format!( + "{description} '{}' does not exist or is not a file", + path.display() + ))); + } + std::fs::canonicalize(path).map_err(|err| { + Error::config(format!( + "failed to resolve {description} '{}': {err}", + path.display() + )) + }) +} + +pub(crate) fn validate_linux_elf_binary(path: &Path) -> CoreResult<()> { + let mut file = std::fs::File::open(path).map_err(|err| { + Error::config(format!( + "failed to open docker supervisor binary '{}': {err}", + path.display() + )) + })?; + let mut magic = [0_u8; 4]; + file.read_exact(&mut magic).map_err(|err| { + Error::config(format!( + "failed to read docker supervisor binary '{}': {err}", + path.display() + )) + })?; + if magic != [0x7f, b'E', b'L', b'F'] { + return Err(Error::config(format!( + "docker supervisor binary '{}' must be a Linux ELF executable", + path.display() + ))); + } + Ok(()) +} + +pub(crate) fn docker_guest_tls_paths( + config: &Config, + docker_config: &DockerComputeConfig, +) -> CoreResult> { + let tls_flags_provided = docker_config.guest_tls_ca.is_some() + || docker_config.guest_tls_cert.is_some() + || docker_config.guest_tls_key.is_some(); + + if !config.grpc_endpoint.starts_with("https://") { + if tls_flags_provided { + return Err(Error::config(format!( + "--docker-tls-ca/--docker-tls-cert/--docker-tls-key were provided but OPENSHELL_GRPC_ENDPOINT is '{}'; TLS materials require an https:// endpoint", + config.grpc_endpoint, + ))); + } + return Ok(None); + } + + let provided = [ + docker_config.guest_tls_ca.as_ref(), + docker_config.guest_tls_cert.as_ref(), + docker_config.guest_tls_key.as_ref(), + ]; + if provided.iter().all(Option::is_none) { + return Err(Error::config( + "docker compute driver requires --docker-tls-ca, --docker-tls-cert, and --docker-tls-key when OPENSHELL_GRPC_ENDPOINT uses https://", + )); + } + + let Some(ca) = docker_config.guest_tls_ca.clone() else { + return Err(Error::config( + "--docker-tls-ca is required when Docker sandbox TLS materials are configured", + )); + }; + let Some(cert) = docker_config.guest_tls_cert.clone() else { + return Err(Error::config( + "--docker-tls-cert is required when Docker sandbox TLS materials are configured", + )); + }; + let Some(key) = docker_config.guest_tls_key.clone() else { + return Err(Error::config( + "--docker-tls-key is required when Docker sandbox TLS materials are configured", + )); + }; + + Ok(Some(DockerGuestTlsPaths { + ca: canonicalize_existing_file(&ca, "docker TLS CA certificate")?, + cert: canonicalize_existing_file(&cert, "docker TLS client certificate")?, + key: canonicalize_existing_file(&key, "docker TLS client private key")?, + })) +} + +fn is_not_found_error(err: &BollardError) -> bool { + matches!( + err, + BollardError::DockerResponseServerError { + status_code: 404, + .. + } + ) +} + +fn is_not_modified_error(err: &BollardError) -> bool { + matches!( + err, + BollardError::DockerResponseServerError { + status_code: 304, + .. + } + ) +} + +fn create_status_from_docker_error(operation: &str, err: BollardError) -> Status { + if matches!( + err, + BollardError::DockerResponseServerError { + status_code: 409, + .. + } + ) { + Status::already_exists("sandbox already exists") + } else { + internal_status(operation, err) + } +} + +fn internal_status(operation: &str, err: BollardError) -> Status { + Status::internal(format!("{operation} failed: {err}")) +} + +#[cfg(test)] +mod tests; diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs new file mode 100644 index 000000000..5b323e6e8 --- /dev/null +++ b/crates/openshell-driver-docker/src/tests.rs @@ -0,0 +1,525 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use super::*; +use openshell_core::proto::compute::v1::{ + DriverResourceRequirements, DriverSandboxSpec, DriverSandboxTemplate, +}; +use std::fs; +use tempfile::TempDir; + +const TLS_MOUNT_DIR: &str = "/etc/openshell/tls/client"; + +fn test_sandbox() -> DriverSandbox { + DriverSandbox { + id: "sbx-123".to_string(), + name: "demo".to_string(), + namespace: "default".to_string(), + spec: Some(DriverSandboxSpec { + log_level: "debug".to_string(), + environment: HashMap::from([("SPEC_ENV".to_string(), "spec".to_string())]), + template: Some(DriverSandboxTemplate { + image: "ghcr.io/nvidia/openshell/sandbox:dev".to_string(), + agent_socket_path: String::new(), + labels: HashMap::new(), + environment: HashMap::from([("TEMPLATE_ENV".to_string(), "template".to_string())]), + resources: None, + platform_config: None, + }), + gpu: false, + }), + status: None, + } +} + +fn runtime_config() -> DockerDriverRuntimeConfig { + DockerDriverRuntimeConfig { + default_image: "image:latest".to_string(), + image_pull_policy: String::new(), + sandbox_namespace: "default".to_string(), + grpc_endpoint: "https://localhost:8443".to_string(), + ssh_socket_path: "/run/openshell/ssh.sock".to_string(), + stop_timeout_secs: DEFAULT_STOP_TIMEOUT_SECS, + log_level: "info".to_string(), + supervisor_bin: PathBuf::from("/tmp/openshell-sandbox"), + guest_tls: Some(DockerGuestTlsPaths { + ca: PathBuf::from("/tmp/ca.crt"), + cert: PathBuf::from("/tmp/tls.crt"), + key: PathBuf::from("/tmp/tls.key"), + }), + daemon_version: "28.0.0".to_string(), + } +} + +#[test] +fn container_visible_endpoint_rewrites_loopback_hosts() { + assert_eq!( + container_visible_openshell_endpoint("https://localhost:8443"), + "https://host.openshell.internal:8443/" + ); + assert_eq!( + container_visible_openshell_endpoint("http://127.0.0.1:8080"), + "http://host.openshell.internal:8080/" + ); + assert_eq!( + container_visible_openshell_endpoint("https://gateway.internal:8443"), + "https://gateway.internal:8443" + ); +} + +#[test] +fn parse_cpu_limit_supports_cores_and_millicores() { + assert_eq!(parse_cpu_limit("250m").unwrap(), Some(250_000_000)); + assert_eq!(parse_cpu_limit("2").unwrap(), Some(2_000_000_000)); + assert!(parse_cpu_limit("0").is_err()); +} + +#[test] +fn parse_memory_limit_supports_binary_quantities() { + assert_eq!(parse_memory_limit("512Mi").unwrap(), Some(536_870_912)); + assert_eq!(parse_memory_limit("1G").unwrap(), Some(1_000_000_000)); + assert!(parse_memory_limit("12XB").is_err()); +} + +#[test] +fn docker_resource_limits_rejects_requests() { + let template = DriverSandboxTemplate { + image: "img".to_string(), + agent_socket_path: String::new(), + labels: HashMap::new(), + environment: HashMap::new(), + resources: Some(DriverResourceRequirements { + cpu_request: "250m".to_string(), + cpu_limit: String::new(), + memory_request: String::new(), + memory_limit: String::new(), + }), + platform_config: None, + }; + + let err = docker_resource_limits(&template).unwrap_err(); + assert_eq!(err.code(), tonic::Code::FailedPrecondition); + assert!(err.message().contains("resources.requests.cpu")); +} + +#[test] +fn build_environment_sets_docker_tls_paths() { + let env = build_environment(&test_sandbox(), &runtime_config()); + assert!(env.contains(&format!("OPENSHELL_TLS_CA={TLS_CA_MOUNT_PATH}"))); + assert!(env.contains(&format!("OPENSHELL_TLS_CERT={TLS_CERT_MOUNT_PATH}"))); + assert!(env.contains(&format!("OPENSHELL_TLS_KEY={TLS_KEY_MOUNT_PATH}"))); + assert!(env.contains(&"TEMPLATE_ENV=template".to_string())); + assert!(env.contains(&"SPEC_ENV=spec".to_string())); + assert!(env.contains(&"OPENSHELL_SANDBOX_COMMAND=sleep infinity".to_string())); + assert!( + !env.iter() + .any(|entry| entry.starts_with("OPENSHELL_SSH_HANDSHAKE_SECRET=")) + ); + assert!( + !env.iter() + .any(|entry| entry.starts_with("OPENSHELL_SSH_HANDSHAKE_SKEW_SECS=")) + ); +} + +#[test] +fn build_mounts_uses_docker_tls_directory() { + let mounts = build_mounts(&runtime_config()); + let targets = mounts + .iter() + .filter_map(|mount| mount.target.clone()) + .collect::>(); + assert!(targets.contains(&SUPERVISOR_MOUNT_PATH.to_string())); + assert!(targets.contains(&TLS_CA_MOUNT_PATH.to_string())); + assert!(targets.contains(&TLS_CERT_MOUNT_PATH.to_string())); + assert!(targets.contains(&TLS_KEY_MOUNT_PATH.to_string())); + assert!( + targets + .iter() + .all(|target| target.starts_with(TLS_MOUNT_DIR) || target == SUPERVISOR_MOUNT_PATH) + ); +} + +#[test] +fn managed_container_label_filters_include_gateway_namespace() { + let filters = + managed_container_label_filters("tenant-a", [format!("{SANDBOX_ID_LABEL_KEY}=sbx-123")]); + let labels = filters.get("label").unwrap(); + + assert!(labels.contains(&format!("{MANAGED_BY_LABEL_KEY}={MANAGED_BY_LABEL_VALUE}"))); + assert!(labels.contains(&format!("{SANDBOX_NAMESPACE_LABEL_KEY}=tenant-a"))); + assert!(labels.contains(&format!("{SANDBOX_ID_LABEL_KEY}=sbx-123"))); +} + +#[test] +fn build_container_create_body_clears_inherited_cmd() { + let create_body = build_container_create_body(&test_sandbox(), &runtime_config()).unwrap(); + + assert_eq!( + create_body.entrypoint, + Some(vec![SUPERVISOR_MOUNT_PATH.to_string()]) + ); + assert_eq!(create_body.cmd, Some(Vec::new())); + assert_eq!( + create_body + .labels + .as_ref() + .and_then(|labels| labels.get(SANDBOX_NAMESPACE_LABEL_KEY)), + Some(&"default".to_string()) + ); +} + +#[test] +fn driver_status_keeps_running_sandboxes_provisioning_with_stable_message() { + let running = ContainerSummary { + id: Some("cid".to_string()), + names: Some(vec!["/openshell-demo".to_string()]), + labels: Some(HashMap::from([ + (SANDBOX_ID_LABEL_KEY.to_string(), "sbx-1".to_string()), + (SANDBOX_NAME_LABEL_KEY.to_string(), "demo".to_string()), + ( + SANDBOX_NAMESPACE_LABEL_KEY.to_string(), + "default".to_string(), + ), + ])), + state: Some(ContainerSummaryStateEnum::RUNNING), + status: Some("Up 2 seconds".to_string()), + ..Default::default() + }; + let exited = ContainerSummary { + state: Some(ContainerSummaryStateEnum::EXITED), + status: Some("Exited (1) 3 seconds ago".to_string()), + ..running.clone() + }; + let running_later = ContainerSummary { + status: Some("Up 4 seconds".to_string()), + ..running.clone() + }; + + let running_status = driver_status_from_summary(&running, "demo", false); + let running_later_status = driver_status_from_summary(&running_later, "demo", false); + assert_eq!(running_status.conditions[0].status, "False"); + assert_eq!(running_status.conditions[0].reason, "DependenciesNotReady"); + assert_eq!( + running_status.conditions[0].message, + "Container is running; waiting for supervisor relay" + ); + assert_eq!(running_status.conditions, running_later_status.conditions); + + let exited_status = driver_status_from_summary(&exited, "demo", false); + assert_eq!(exited_status.conditions[0].status, "False"); + assert_eq!(exited_status.conditions[0].reason, "ContainerExited"); + assert_eq!(exited_status.conditions[0].message, "Container exited"); + + // With a live supervisor session, a RUNNING container flips Ready=True + // so ExecSandbox and other "sandbox must be ready" gates can proceed. + let running_connected = driver_status_from_summary(&running, "demo", true); + assert_eq!(running_connected.conditions[0].status, "True"); + assert_eq!( + running_connected.conditions[0].reason, + "SupervisorConnected" + ); + + // Supervisor readiness is ignored for non-RUNNING states -- an exited + // container must not report Ready=True. + let exited_connected = driver_status_from_summary(&exited, "demo", true); + assert_eq!(exited_connected.conditions[0].status, "False"); +} + +#[test] +fn driver_status_marks_restarting_sandboxes_as_error() { + let restarting = ContainerSummary { + id: Some("cid".to_string()), + names: Some(vec!["/openshell-demo".to_string()]), + labels: Some(HashMap::from([ + (SANDBOX_ID_LABEL_KEY.to_string(), "sbx-1".to_string()), + (SANDBOX_NAME_LABEL_KEY.to_string(), "demo".to_string()), + ( + SANDBOX_NAMESPACE_LABEL_KEY.to_string(), + "default".to_string(), + ), + ])), + state: Some(ContainerSummaryStateEnum::RESTARTING), + status: Some("Restarting (1) 2 seconds ago".to_string()), + ..Default::default() + }; + + let status = driver_status_from_summary(&restarting, "demo", false); + assert_eq!(status.conditions[0].status, "False"); + assert_eq!(status.conditions[0].reason, "ContainerRestarting"); + assert_eq!( + status.conditions[0].message, + "Container is restarting after a failure" + ); +} + +#[test] +fn validate_linux_elf_binary_rejects_non_elf_files() { + let tempdir = TempDir::new().unwrap(); + let path = tempdir.path().join("openshell-sandbox"); + fs::write(&path, b"not-elf").unwrap(); + + let err = validate_linux_elf_binary(&path).unwrap_err(); + assert!(err.to_string().contains("Linux ELF executable")); +} + +#[test] +fn docker_guest_tls_paths_require_all_files_for_https() { + let config = Config::new(None).with_grpc_endpoint("https://localhost:8443"); + let tempdir = TempDir::new().unwrap(); + let ca = tempdir.path().join("ca.crt"); + fs::write(&ca, b"ca").unwrap(); + + let err = docker_guest_tls_paths( + &config, + &DockerComputeConfig { + guest_tls_ca: Some(ca), + ..Default::default() + }, + ) + .unwrap_err(); + assert!(err.to_string().contains("--docker-tls-cert")); +} + +#[test] +fn linux_supervisor_candidates_follow_daemon_arch() { + assert_eq!( + linux_supervisor_candidates("amd64"), + vec![PathBuf::from( + "target/x86_64-unknown-linux-gnu/release/openshell-sandbox", + )] + ); + assert_eq!( + linux_supervisor_candidates("arm64"), + vec![PathBuf::from( + "target/aarch64-unknown-linux-gnu/release/openshell-sandbox", + )] + ); +} + +#[test] +fn container_name_preserves_id_suffix_for_long_names() { + // Names up to 253 chars are permitted by the gRPC layer. The id + // suffix is what makes the container name unique between sandboxes + // sharing a prefix, so it must always appear in the final name. + let long_name = "a".repeat(253); + let first = DriverSandbox { + id: "sbx-first-1234567890".to_string(), + name: long_name, + namespace: "default".to_string(), + spec: None, + status: None, + }; + let second = DriverSandbox { + id: "sbx-second-0987654321".to_string(), + ..first.clone() + }; + + let first_container = container_name_for_sandbox(&first); + let second_container = container_name_for_sandbox(&second); + + assert!( + first_container.len() <= MAX_CONTAINER_NAME_LEN, + "container name {} exceeded {MAX_CONTAINER_NAME_LEN} chars: {first_container}", + first_container.len(), + ); + assert!( + first_container.ends_with(&first.id), + "container name should end with sandbox id: {first_container}", + ); + assert_ne!( + first_container, second_container, + "container names must differ for sandboxes with distinct ids", + ); +} + +#[test] +fn container_name_empty_sandbox_name_uses_id_only() { + let sandbox = DriverSandbox { + id: "sbx-abc".to_string(), + name: String::new(), + namespace: "default".to_string(), + spec: None, + status: None, + }; + assert_eq!(container_name_for_sandbox(&sandbox), "openshell-sbx-abc",); +} + +#[test] +fn trim_container_name_tail_strips_separators() { + assert_eq!(trim_container_name_tail("foo-".to_string()), "foo"); + assert_eq!(trim_container_name_tail("foo-.".to_string()), "foo"); + assert_eq!(trim_container_name_tail("foo_-.".to_string()), "foo"); + assert_eq!(trim_container_name_tail("foo".to_string()), "foo"); +} + +#[test] +fn docker_guest_tls_paths_rejects_tls_flags_without_https() { + let config = Config::new(None).with_grpc_endpoint("http://localhost:8080"); + let tempdir = TempDir::new().unwrap(); + let ca = tempdir.path().join("ca.crt"); + fs::write(&ca, b"ca").unwrap(); + + let err = docker_guest_tls_paths( + &config, + &DockerComputeConfig { + guest_tls_ca: Some(ca), + ..Default::default() + }, + ) + .unwrap_err(); + assert!(err.to_string().contains("https://")); +} + +#[test] +fn docker_guest_tls_paths_allows_plain_http_without_tls_flags() { + let config = Config::new(None).with_grpc_endpoint("http://localhost:8080"); + let result = docker_guest_tls_paths(&config, &DockerComputeConfig::default()).unwrap(); + assert!(result.is_none()); +} + +#[test] +fn default_docker_supervisor_image_uses_nvidia_ghcr_repo() { + let image = default_docker_supervisor_image(); + assert!( + image.starts_with("ghcr.io/nvidia/openshell/supervisor:"), + "unexpected default image reference: {image}", + ); +} + +#[test] +fn docker_supervisor_image_tag_prefers_explicit_build_tags() { + assert_eq!( + resolve_default_docker_supervisor_image_tag(Some("1.2.3"), Some("sha"), "0.0.0"), + "1.2.3", + ); + assert_eq!( + resolve_default_docker_supervisor_image_tag(None, Some("sha"), "0.0.0"), + "sha", + ); + assert_eq!( + resolve_default_docker_supervisor_image_tag(None, None, "1.2.3"), + "1.2.3", + ); + assert_eq!( + resolve_default_docker_supervisor_image_tag(Some(""), Some(""), "0.0.0"), + "dev", + ); +} + +#[test] +fn supervisor_cache_path_namespaces_by_digest_under_openshell_data_dir() { + let base = PathBuf::from("/var/cache/share"); + let path = + supervisor_cache_path_with_base(&base, "sha256:abc123deadbeef0123456789cafe0123456789fe"); + + assert_eq!( + path, + PathBuf::from( + "/var/cache/share/openshell/docker-supervisor/sha256-abc123deadbeef0123456789cafe0123456789fe/openshell-sandbox", + ), + ); +} + +#[test] +fn supervisor_cache_path_isolates_different_digests() { + let base = PathBuf::from("/data"); + let left = supervisor_cache_path_with_base(&base, "sha256:aaaaaaaa"); + let right = supervisor_cache_path_with_base(&base, "sha256:bbbbbbbb"); + assert_ne!( + left.parent().unwrap(), + right.parent().unwrap(), + "digest-keyed directories must differ so rollouts are isolated", + ); +} + +#[test] +fn write_cache_binary_atomic_materializes_file_with_executable_mode() { + let tempdir = TempDir::new().unwrap(); + let target = tempdir.path().join("nested").join("openshell-sandbox"); + fs::create_dir_all(target.parent().unwrap()).unwrap(); + + write_cache_binary_atomic(&target, b"\x7fELFpayload").unwrap(); + + assert!(target.is_file()); + assert_eq!(fs::read(&target).unwrap(), b"\x7fELFpayload"); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mode = fs::metadata(&target).unwrap().permissions().mode() & 0o777; + assert_eq!(mode, 0o755, "expected 0755, got {mode:04o}"); + } +} + +#[test] +fn write_cache_binary_atomic_overwrites_existing_file() { + let tempdir = TempDir::new().unwrap(); + let target = tempdir.path().join("openshell-sandbox"); + fs::write(&target, b"stale").unwrap(); + + write_cache_binary_atomic(&target, b"\x7fELFfresh").unwrap(); + assert_eq!(fs::read(&target).unwrap(), b"\x7fELFfresh"); +} + +#[test] +fn temp_extract_container_names_are_unique_per_call() { + let first = temp_extract_container_name(); + let second = temp_extract_container_name(); + assert_ne!(first, second); + assert!(first.starts_with("openshell-supervisor-extract-")); +} + +#[test] +fn extract_first_tar_entry_returns_payload_of_single_file_archive() { + // Build a tar archive with the same shape Docker returns from + // `/containers//archive` for a single file. + let payload = b"\x7fELFtest-binary-bytes"; + let mut tar_buf = Vec::new(); + { + let mut builder = tar::Builder::new(&mut tar_buf); + let mut header = tar::Header::new_gnu(); + header.set_path("openshell-sandbox").unwrap(); + header.set_size(payload.len() as u64); + header.set_mode(0o755); + header.set_cksum(); + builder.append(&header, payload.as_slice()).unwrap(); + builder.finish().unwrap(); + } + + let extracted = extract_first_tar_entry(&tar_buf).unwrap(); + assert_eq!(extracted, payload); +} + +#[test] +fn extract_first_tar_entry_rejects_empty_archive() { + let mut tar_buf = Vec::new(); + tar::Builder::new(&mut tar_buf).finish().unwrap(); + let err = extract_first_tar_entry(&tar_buf).unwrap_err(); + assert!(err.contains("empty"), "unexpected error message: {err}"); +} + +#[test] +fn container_state_needs_resume_matches_startable_states() { + for state in [ + ContainerSummaryStateEnum::EXITED, + ContainerSummaryStateEnum::CREATED, + ] { + assert!( + container_state_needs_resume(state), + "{state:?} should be resumed with Docker start", + ); + } + + for state in [ + ContainerSummaryStateEnum::RUNNING, + ContainerSummaryStateEnum::RESTARTING, + ContainerSummaryStateEnum::PAUSED, + ContainerSummaryStateEnum::DEAD, + ContainerSummaryStateEnum::REMOVING, + ContainerSummaryStateEnum::EMPTY, + ] { + assert!( + !container_state_needs_resume(state), + "{state:?} should not be resumed with Docker start", + ); + } +} diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index c2c392f33..28bc46257 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -16,6 +16,7 @@ path = "src/main.rs" [dependencies] openshell-core = { path = "../openshell-core" } +openshell-driver-docker = { path = "../openshell-driver-docker" } openshell-driver-kubernetes = { path = "../openshell-driver-kubernetes" } openshell-driver-podman = { path = "../openshell-driver-podman" } openshell-ocsf = { path = "../openshell-ocsf" } @@ -71,7 +72,6 @@ tokio-stream = { workspace = true } sqlx = { workspace = true } reqwest = { workspace = true } uuid = { workspace = true } -url = { workspace = true } hmac = "0.12" sha2 = "0.10" hex = "0.4" @@ -79,6 +79,7 @@ russh = "0.57" rand = "0.9" petname = "2" ipnet = "2" +tempfile = "3" [features] dev-settings = ["openshell-core/dev-settings"] @@ -86,7 +87,6 @@ dev-settings = ["openshell-core/dev-settings"] [dev-dependencies] hyper-rustls = { version = "0.27", default-features = false, features = ["native-tokio", "http1", "tls12", "logging", "ring", "webpki-tokio"] } rcgen = { version = "0.13", features = ["crypto", "pem"] } -tempfile = "3" tokio-tungstenite = { workspace = true } futures-util = "0.3" wiremock = "0.6" diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index 2e6e2823b..2df0b06f4 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -14,7 +14,7 @@ use std::path::PathBuf; use tracing::info; use tracing_subscriber::EnvFilter; -use crate::compute::VmComputeConfig; +use crate::compute::{DockerComputeConfig, VmComputeConfig}; use crate::{run_server, tracing_bus::TracingLogBus}; /// `OpenShell` gateway process - gRPC and HTTP server with protocol multiplexing. @@ -177,6 +177,33 @@ struct Args { #[arg(long, env = "OPENSHELL_VM_TLS_KEY")] vm_tls_key: Option, + /// Linux `openshell-sandbox` binary bind-mounted into Docker sandboxes. + /// + /// When unset the gateway falls back to (in order) a sibling + /// `openshell-sandbox` next to the gateway binary, a local cargo build, + /// or extracting the binary from `--docker-supervisor-image`. + #[arg(long, env = "OPENSHELL_DOCKER_SUPERVISOR_BIN")] + docker_supervisor_bin: Option, + + /// Image the Docker driver pulls to extract the Linux + /// `openshell-sandbox` binary when no explicit `--docker-supervisor-bin` + /// override or local build is available. Defaults to + /// `ghcr.io/nvidia/openshell/supervisor:`. + #[arg(long, env = "OPENSHELL_DOCKER_SUPERVISOR_IMAGE")] + docker_supervisor_image: Option, + + /// CA certificate bind-mounted into Docker sandboxes for gateway mTLS. + #[arg(long, env = "OPENSHELL_DOCKER_TLS_CA")] + docker_tls_ca: Option, + + /// Client certificate bind-mounted into Docker sandboxes for gateway mTLS. + #[arg(long, env = "OPENSHELL_DOCKER_TLS_CERT")] + docker_tls_cert: Option, + + /// Client private key bind-mounted into Docker sandboxes for gateway mTLS. + #[arg(long, env = "OPENSHELL_DOCKER_TLS_KEY")] + docker_tls_key: Option, + /// Disable TLS entirely — listen on plaintext HTTP. /// Use this when the gateway sits behind a reverse proxy or tunnel /// (e.g. Cloudflare Tunnel) that terminates TLS at the edge. @@ -315,6 +342,14 @@ async fn run_from_args(args: Args) -> Result<()> { guest_tls_key: args.vm_tls_key, }; + let docker_config = DockerComputeConfig { + supervisor_bin: args.docker_supervisor_bin, + supervisor_image: args.docker_supervisor_image, + guest_tls_ca: args.docker_tls_ca, + guest_tls_cert: args.docker_tls_cert, + guest_tls_key: args.docker_tls_key, + }; + if args.disable_tls { info!("TLS disabled — listening on plaintext HTTP"); } else if args.disable_gateway_auth { @@ -323,7 +358,7 @@ async fn run_from_args(args: Args) -> Result<()> { info!(bind = %config.bind_address, "Starting OpenShell server"); - run_server(config, vm_config, tracing_log_bus) + run_server(config, vm_config, docker_config, tracing_log_bus) .await .into_diagnostic() } diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 473106839..60db54666 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -5,6 +5,7 @@ pub mod vm; +pub use openshell_driver_docker::DockerComputeConfig; pub use vm::VmComputeConfig; use crate::grpc::policy::{SANDBOX_SETTINGS_OBJECT_TYPE, sandbox_settings_id}; @@ -26,6 +27,7 @@ use openshell_core::proto::{ PlatformEvent, Sandbox, SandboxCondition, SandboxPhase, SandboxSpec, SandboxStatus, SandboxTemplate, SshSession, }; +use openshell_driver_docker::DockerComputeDriver; use openshell_driver_kubernetes::{ ComputeDriverService, KubernetesComputeConfig, KubernetesComputeDriver, }; @@ -46,6 +48,45 @@ type DriverWatchStream = Pin + Send + Sync>; +#[tonic::async_trait] +trait ShutdownCleanup: Send + Sync { + async fn cleanup_on_shutdown(&self) -> Result<(), String>; +} + +#[tonic::async_trait] +impl ShutdownCleanup for DockerComputeDriver { + async fn cleanup_on_shutdown(&self) -> Result<(), String> { + let stopped = self + .stop_managed_containers_on_shutdown() + .await + .map_err(|err| err.to_string())?; + info!( + stopped_containers = stopped, + "Stopped Docker sandbox containers during gateway shutdown" + ); + Ok(()) + } +} + +/// Resume a single sandbox whose store record indicates it should be +/// running. Implemented by drivers (currently only Docker) where compute +/// resources do not auto-restart with the gateway. Returns `Ok(true)` if +/// the backend resource was found and resumed (or was already running), +/// `Ok(false)` if no backend resource exists. +#[tonic::async_trait] +trait StartupResume: Send + Sync { + async fn resume_sandbox(&self, sandbox_id: &str, sandbox_name: &str) -> Result; +} + +#[tonic::async_trait] +impl StartupResume for DockerComputeDriver { + async fn resume_sandbox(&self, sandbox_id: &str, sandbox_name: &str) -> Result { + DockerComputeDriver::resume_sandbox(self, sandbox_id, sandbox_name) + .await + .map_err(|err| err.to_string()) + } +} + /// Interval between store-vs-backend reconciliation sweeps. const RECONCILE_INTERVAL: Duration = Duration::from_secs(60); @@ -180,6 +221,8 @@ impl ComputeDriver for RemoteComputeDriver { #[derive(Clone)] pub struct ComputeRuntime { driver: SharedComputeDriver, + shutdown_cleanup: Option>, + startup_resume: Option>, _driver_process: Option>, default_image: String, store: Arc, @@ -197,15 +240,18 @@ impl fmt::Debug for ComputeRuntime { } impl ComputeRuntime { + #[allow(clippy::too_many_arguments)] async fn from_driver( driver: SharedComputeDriver, + shutdown_cleanup: Option>, + startup_resume: Option>, driver_process: Option>, store: Arc, sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, supervisor_sessions: Arc, - allows_loopback_endpoints: bool, + _allows_loopback_endpoints: bool, ) -> Result { let default_image = driver .get_capabilities(Request::new(GetCapabilitiesRequest {})) @@ -215,6 +261,8 @@ impl ComputeRuntime { .default_image; Ok(Self { driver, + shutdown_cleanup, + startup_resume, _driver_process: driver_process, default_image, store, @@ -226,6 +274,38 @@ impl ComputeRuntime { }) } + pub async fn new_docker( + config: openshell_core::Config, + docker_config: DockerComputeConfig, + store: Arc, + sandbox_index: SandboxIndex, + sandbox_watch_bus: SandboxWatchBus, + tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, + ) -> Result { + let driver = Arc::new( + DockerComputeDriver::new(&config, &docker_config, supervisor_sessions.clone()) + .await + .map_err(|err| ComputeError::Message(err.to_string()))?, + ); + let shutdown_cleanup: Arc = driver.clone(); + let startup_resume: Arc = driver.clone(); + let driver: SharedComputeDriver = driver; + Self::from_driver( + driver, + Some(shutdown_cleanup), + Some(startup_resume), + None, + store, + sandbox_index, + sandbox_watch_bus, + tracing_log_bus, + supervisor_sessions, + true, + ) + .await + } + pub async fn new_kubernetes( config: KubernetesComputeConfig, store: Arc, @@ -241,6 +321,8 @@ impl ComputeRuntime { Self::from_driver( driver, None, + None, + None, store, sandbox_index, sandbox_watch_bus, @@ -263,6 +345,8 @@ impl ComputeRuntime { let driver: SharedComputeDriver = Arc::new(RemoteComputeDriver::new(channel)); Self::from_driver( driver, + None, + None, driver_process, store, sandbox_index, @@ -289,6 +373,8 @@ impl ComputeRuntime { Self::from_driver( driver, None, + None, + None, store, sandbox_index, sandbox_watch_bus, @@ -455,6 +541,142 @@ impl ComputeRuntime { }); } + pub async fn cleanup_on_shutdown(&self) -> Result<(), String> { + let Some(cleanup) = &self.shutdown_cleanup else { + return Ok(()); + }; + cleanup.cleanup_on_shutdown().await + } + + /// Resume sandboxes whose store records say they should be running. + /// Drivers that do not auto-restart compute resources across gateway + /// restarts (currently only Docker) implement `StartupResume`. For + /// each sandbox in the store whose phase is not `Deleting` or + /// `Error`, we ask the driver to resume the underlying resource. If + /// the driver reports that the resource no longer exists or fails to + /// start, the sandbox is moved to the `Error` phase so the failure + /// surfaces in the UI. + /// + /// Should be called once at gateway startup, before watchers spawn, + /// so the watch loop sees the post-resume state on its first poll. + pub async fn resume_persisted_sandboxes(&self) -> Result<(), String> { + let Some(resume) = &self.startup_resume else { + return Ok(()); + }; + + let records = self + .store + .list(Sandbox::object_type(), 1000, 0) + .await + .map_err(|e| e.to_string())?; + + let mut resumed = 0usize; + let mut missing = 0usize; + let mut failed = 0usize; + + for record in records { + let sandbox = match Sandbox::decode(record.payload.as_slice()) { + Ok(sandbox) => sandbox, + Err(err) => { + warn!(error = %err, "Failed to decode sandbox record during startup resume"); + continue; + } + }; + + let phase = SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown); + if !sandbox_phase_should_be_running(phase) { + continue; + } + + match resume + .resume_sandbox(sandbox.object_id(), sandbox.object_name()) + .await + { + Ok(true) => { + info!( + sandbox_id = %sandbox.object_id(), + sandbox_name = %sandbox.object_name(), + ?phase, + "Resumed sandbox during gateway startup" + ); + resumed += 1; + } + Ok(false) => { + // Backend resource is gone but the store still + // remembers the sandbox. Mark Error so the UI + // surfaces the inconsistency; the reconcile loop + // will eventually prune it after the orphan grace + // period. + warn!( + sandbox_id = %sandbox.object_id(), + sandbox_name = %sandbox.object_name(), + "Cannot resume sandbox: backend resource is missing" + ); + self.mark_sandbox_error( + &sandbox, + "BackendResourceMissing", + "Sandbox container disappeared while the gateway was offline", + ) + .await; + missing += 1; + } + Err(err) => { + warn!( + sandbox_id = %sandbox.object_id(), + sandbox_name = %sandbox.object_name(), + error = %err, + "Failed to resume sandbox during gateway startup" + ); + self.mark_sandbox_error( + &sandbox, + "ResumeFailed", + &format!("Failed to resume sandbox during gateway startup: {err}"), + ) + .await; + failed += 1; + } + } + } + + if resumed > 0 || missing > 0 || failed > 0 { + info!( + resumed, + missing_backend = missing, + failed, + "Sandbox resume sweep complete" + ); + } + Ok(()) + } + + async fn mark_sandbox_error(&self, sandbox: &Sandbox, reason: &str, message: &str) { + let _guard = self.sync_lock.lock().await; + let mut updated = sandbox.clone(); + updated.phase = SandboxPhase::Error as i32; + let updated_name = updated.object_name().to_string(); + upsert_ready_condition( + &mut updated.status, + &updated_name, + SandboxCondition { + r#type: "Ready".to_string(), + status: "False".to_string(), + reason: reason.to_string(), + message: message.to_string(), + last_transition_time: String::new(), + }, + ); + self.sandbox_index.update_from_sandbox(&updated); + if let Err(err) = self.store.put_message(&updated).await { + warn!( + sandbox_id = %sandbox.object_id(), + error = %err, + "Failed to persist sandbox error state during startup resume" + ); + return; + } + self.sandbox_watch_bus.notify(sandbox.object_id()); + } + async fn watch_loop(self: Arc) { loop { let mut stream = match self @@ -978,14 +1200,14 @@ fn build_platform_config(template: &SandboxTemplate) -> Option Option, +) -> Option { + use prost_types::{Struct, Value, value::Kind}; + + let resources = resources.as_ref()?; + let mut fields = std::collections::BTreeMap::new(); + + for (section_name, value) in &resources.fields { + if !matches!(section_name.as_str(), "limits" | "requests") { + fields.insert(section_name.clone(), value.clone()); + continue; + } + + let Some(Kind::StructValue(section)) = value.kind.as_ref() else { + fields.insert(section_name.clone(), value.clone()); + continue; + }; + + let section_fields = section + .fields + .iter() + .filter_map(|(resource_name, resource_value)| { + let is_typed_quantity = matches!(resource_name.as_str(), "cpu" | "memory") + && matches!(resource_value.kind.as_ref(), Some(Kind::StringValue(_))); + if is_typed_quantity { + None + } else { + Some((resource_name.clone(), resource_value.clone())) + } + }) + .collect::>(); + + if !section_fields.is_empty() { + fields.insert( + section_name.clone(), + Value { + kind: Some(Kind::StructValue(Struct { + fields: section_fields, + })), + }, + ); + } + } + + if fields.is_empty() { + None + } else { + Some(Struct { fields }) + } +} + fn driver_status_from_public(status: &SandboxStatus, phase: i32) -> DriverSandboxStatus { DriverSandboxStatus { sandbox_name: status.sandbox_name.clone(), @@ -1181,6 +1455,17 @@ fn rewrite_user_facing_conditions(status: &mut Option, spec: Opti } } +/// Phases for which a sandbox should have a running compute resource. +/// `Deleting` and `Error` are intentionally excluded: deletion is in +/// progress, or the sandbox has already failed and should not be +/// silently revived. +fn sandbox_phase_should_be_running(phase: SandboxPhase) -> bool { + matches!( + phase, + SandboxPhase::Provisioning | SandboxPhase::Ready | SandboxPhase::Unknown + ) +} + fn is_terminal_failure_reason(reason: &str) -> bool { let reason = reason.to_ascii_lowercase(); let transient_reasons = [ @@ -1205,6 +1490,31 @@ mod tests { use std::sync::Arc; use tokio::sync::{mpsc, oneshot}; + fn string_value(value: &str) -> prost_types::Value { + prost_types::Value { + kind: Some(prost_types::value::Kind::StringValue(value.to_string())), + } + } + + fn number_value(value: f64) -> prost_types::Value { + prost_types::Value { + kind: Some(prost_types::value::Kind::NumberValue(value)), + } + } + + fn struct_value( + fields: impl IntoIterator, prost_types::Value)>, + ) -> prost_types::Value { + prost_types::Value { + kind: Some(prost_types::value::Kind::StructValue(prost_types::Struct { + fields: fields + .into_iter() + .map(|(key, value)| (key.into(), value)) + .collect(), + })), + } + } + #[derive(Debug, Default)] struct TestDriver { listed_sandboxes: Vec, @@ -1310,9 +1620,18 @@ mod tests { } async fn test_runtime(driver: SharedComputeDriver) -> ComputeRuntime { + test_runtime_with_resume(driver, None).await + } + + async fn test_runtime_with_resume( + driver: SharedComputeDriver, + startup_resume: Option>, + ) -> ComputeRuntime { let store = Arc::new(Store::connect("sqlite::memory:").await.unwrap()); ComputeRuntime { driver, + shutdown_cleanup: None, + startup_resume, _driver_process: None, default_image: "openshell/sandbox:test".to_string(), store, @@ -1478,6 +1797,123 @@ mod tests { assert_eq!(derive_phase(Some(&status)), SandboxPhase::Ready); } + #[test] + fn build_platform_config_omits_typed_cpu_and_memory_resources() { + let template = SandboxTemplate { + resources: Some(prost_types::Struct { + fields: [ + ( + "limits", + struct_value([("cpu", string_value("2")), ("memory", string_value("1Gi"))]), + ), + ( + "requests", + struct_value([ + ("cpu", string_value("500m")), + ("memory", string_value("512Mi")), + ]), + ), + ] + .into_iter() + .map(|(key, value)| (key.to_string(), value)) + .collect(), + }), + ..Default::default() + }; + + assert!(build_platform_config(&template).is_none()); + } + + #[test] + fn build_platform_config_preserves_non_typed_resource_fields() { + let template = SandboxTemplate { + resources: Some(prost_types::Struct { + fields: [ + ( + "limits", + struct_value([ + ("cpu", string_value("2")), + ("memory", string_value("1Gi")), + ("nvidia.com/gpu", string_value("1")), + ]), + ), + ( + "requests", + struct_value([ + ("cpu", string_value("500m")), + ("memory", string_value("512Mi")), + ("hugepages-2Mi", string_value("4Mi")), + ]), + ), + ("opaque_cpu", number_value(2.0)), + ] + .into_iter() + .map(|(key, value)| (key.to_string(), value)) + .collect(), + }), + ..Default::default() + }; + + let platform_config = build_platform_config(&template).unwrap(); + let resources_raw = platform_config + .fields + .get("resources_raw") + .and_then(|value| value.kind.as_ref()) + .and_then(|kind| match kind { + prost_types::value::Kind::StructValue(inner) => Some(inner), + _ => None, + }) + .unwrap(); + + let limits = resources_raw + .fields + .get("limits") + .and_then(|value| value.kind.as_ref()) + .and_then(|kind| match kind { + prost_types::value::Kind::StructValue(inner) => Some(inner), + _ => None, + }) + .unwrap(); + assert!(!limits.fields.contains_key("cpu")); + assert!(!limits.fields.contains_key("memory")); + assert_eq!( + limits + .fields + .get("nvidia.com/gpu") + .and_then(|value| value.kind.as_ref()) + .and_then(|kind| match kind { + prost_types::value::Kind::StringValue(value) => Some(value.as_str()), + _ => None, + }), + Some("1") + ); + + let requests = resources_raw + .fields + .get("requests") + .and_then(|value| value.kind.as_ref()) + .and_then(|kind| match kind { + prost_types::value::Kind::StructValue(inner) => Some(inner), + _ => None, + }) + .unwrap(); + assert!(!requests.fields.contains_key("cpu")); + assert!(!requests.fields.contains_key("memory")); + assert_eq!( + requests + .fields + .get("hugepages-2Mi") + .and_then(|value| value.kind.as_ref()) + .and_then(|kind| match kind { + prost_types::value::Kind::StringValue(value) => Some(value.as_str()), + _ => None, + }), + Some("4Mi") + ); + + assert!(resources_raw.fields.contains_key("opaque_cpu")); + } + #[test] fn rewrite_user_facing_conditions_rewrites_gpu_unschedulable_message() { let mut status = Some(SandboxStatus { @@ -1910,4 +2346,162 @@ mod tests { Err(tokio::sync::broadcast::error::TryRecvError::Closed) )); } + + #[derive(Default)] + struct RecordingResume { + calls: tokio::sync::Mutex>, + results: tokio::sync::Mutex>>, + } + + impl RecordingResume { + async fn set_result(&self, sandbox_id: &str, result: Result) { + self.results + .lock() + .await + .insert(sandbox_id.to_string(), result); + } + + async fn calls(&self) -> Vec<(String, String)> { + self.calls.lock().await.clone() + } + } + + #[tonic::async_trait] + impl StartupResume for RecordingResume { + async fn resume_sandbox( + &self, + sandbox_id: &str, + sandbox_name: &str, + ) -> Result { + self.calls + .lock() + .await + .push((sandbox_id.to_string(), sandbox_name.to_string())); + self.results + .lock() + .await + .get(sandbox_id) + .cloned() + .unwrap_or(Ok(true)) + } + } + + #[tokio::test] + async fn resume_persisted_sandboxes_resumes_running_phases() { + let resume = Arc::new(RecordingResume::default()); + let runtime = + test_runtime_with_resume(Arc::new(TestDriver::default()), Some(resume.clone())).await; + + for (id, name, phase) in [ + ("sb-prov", "prov", SandboxPhase::Provisioning), + ("sb-ready", "ready", SandboxPhase::Ready), + ("sb-unknown", "unknown", SandboxPhase::Unknown), + ("sb-deleting", "deleting", SandboxPhase::Deleting), + ("sb-error", "error", SandboxPhase::Error), + ] { + let sandbox = sandbox_record(id, name, phase); + runtime.store.put_message(&sandbox).await.unwrap(); + } + + runtime.resume_persisted_sandboxes().await.unwrap(); + + let mut called_ids = resume + .calls() + .await + .into_iter() + .map(|(id, _)| id) + .collect::>(); + called_ids.sort(); + assert_eq!( + called_ids, + vec![ + "sb-prov".to_string(), + "sb-ready".to_string(), + "sb-unknown".to_string(), + ] + ); + } + + #[tokio::test] + async fn resume_persisted_sandboxes_marks_missing_backend_as_error() { + let resume = Arc::new(RecordingResume::default()); + resume.set_result("sb-1", Ok(false)).await; + let runtime = + test_runtime_with_resume(Arc::new(TestDriver::default()), Some(resume.clone())).await; + + let sandbox = sandbox_record("sb-1", "missing", SandboxPhase::Ready); + runtime.store.put_message(&sandbox).await.unwrap(); + + runtime.resume_persisted_sandboxes().await.unwrap(); + + let stored = runtime + .store + .get_message::("sb-1") + .await + .unwrap() + .unwrap(); + assert_eq!( + SandboxPhase::try_from(stored.phase).unwrap(), + SandboxPhase::Error + ); + let ready = stored + .status + .as_ref() + .and_then(|s| s.conditions.iter().find(|c| c.r#type == "Ready")) + .expect("Ready condition present"); + assert_eq!(ready.reason, "BackendResourceMissing"); + } + + #[tokio::test] + async fn resume_persisted_sandboxes_marks_failed_resume_as_error() { + let resume = Arc::new(RecordingResume::default()); + resume + .set_result("sb-1", Err("docker daemon angry".to_string())) + .await; + let runtime = + test_runtime_with_resume(Arc::new(TestDriver::default()), Some(resume.clone())).await; + + let sandbox = sandbox_record("sb-1", "broken", SandboxPhase::Provisioning); + runtime.store.put_message(&sandbox).await.unwrap(); + + runtime.resume_persisted_sandboxes().await.unwrap(); + + let stored = runtime + .store + .get_message::("sb-1") + .await + .unwrap() + .unwrap(); + assert_eq!( + SandboxPhase::try_from(stored.phase).unwrap(), + SandboxPhase::Error + ); + let ready = stored + .status + .as_ref() + .and_then(|s| s.conditions.iter().find(|c| c.r#type == "Ready")) + .expect("Ready condition present"); + assert_eq!(ready.reason, "ResumeFailed"); + assert!(ready.message.contains("docker daemon angry")); + } + + #[tokio::test] + async fn resume_persisted_sandboxes_is_noop_without_resume_hook() { + let runtime = test_runtime(Arc::new(TestDriver::default())).await; + let sandbox = sandbox_record("sb-1", "anywhere", SandboxPhase::Ready); + runtime.store.put_message(&sandbox).await.unwrap(); + + runtime.resume_persisted_sandboxes().await.unwrap(); + + let stored = runtime + .store + .get_message::("sb-1") + .await + .unwrap() + .unwrap(); + assert_eq!( + SandboxPhase::try_from(stored.phase).unwrap(), + SandboxPhase::Ready + ); + } } diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 4a5ca55bd..d587c0bb3 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -42,9 +42,9 @@ use std::io::ErrorKind; use std::sync::{Arc, Mutex}; use std::time::Duration; use tokio::net::TcpListener; -use tracing::{debug, error, info}; +use tracing::{debug, error, info, warn}; -use compute::{ComputeRuntime, VmComputeConfig}; +use compute::{ComputeRuntime, DockerComputeConfig, VmComputeConfig}; pub use grpc::OpenShellService; pub use http::{health_router, http_router, metrics_router}; pub use multiplex::{MultiplexService, MultiplexedService}; @@ -89,6 +89,10 @@ pub struct ServerState { pub settings_mutex: tokio::sync::Mutex<()>, /// Registry of active supervisor sessions and pending relay channels. + /// + /// Stored as `Arc` so compute drivers (e.g. the Docker driver) + /// can be constructed before `ServerState` and still + /// query session state to surface supervisor readiness. pub supervisor_sessions: Arc, } @@ -136,13 +140,15 @@ impl ServerState { pub async fn run_server( config: Config, vm_config: VmComputeConfig, + docker_config: DockerComputeConfig, tracing_log_bus: TracingLogBus, ) -> Result<()> { let database_url = config.database_url.trim(); if database_url.is_empty() { return Err(Error::config("database_url is required")); } - if config.ssh_handshake_secret.is_empty() { + let driver = configured_compute_driver(&config)?; + if config.ssh_handshake_secret.is_empty() && driver != ComputeDriverKind::Docker { return Err(Error::config( "ssh_handshake_secret is required. Set --ssh-handshake-secret or OPENSHELL_SSH_HANDSHAKE_SECRET", )); @@ -156,6 +162,7 @@ pub async fn run_server( let compute = build_compute_runtime( &config, &vm_config, + &docker_config, store.clone(), sandbox_index.clone(), sandbox_watch_bus.clone(), @@ -173,6 +180,14 @@ pub async fn run_server( supervisor_sessions, )); + // Resume sandboxes that were stopped during the previous gateway + // shutdown so the running compute state matches the persisted store. + // Runs before watchers spawn so the watch loop sees the post-resume + // snapshot on its first poll. + if let Err(err) = state.compute.resume_persisted_sandboxes().await { + warn!(error = %err, "Failed to resume persisted sandboxes during startup"); + } + state.compute.spawn_watchers(); ssh_tunnel::spawn_session_reaper(store.clone(), Duration::from_secs(3600)); supervisor_session::spawn_relay_reaper(state.clone(), Duration::from_secs(30)); @@ -244,13 +259,24 @@ pub async fn run_server( None }; - // Accept connections + let shutdown = shutdown_signal(); + tokio::pin!(shutdown); + + // Accept connections until the gateway receives a graceful shutdown signal. loop { - let (stream, addr) = match listener.accept().await { - Ok(conn) => conn, - Err(e) => { - error!(error = %e, "Failed to accept connection"); - continue; + let (stream, addr) = tokio::select! { + _ = &mut shutdown => { + info!("Shutdown signal received; stopping gateway"); + break; + } + accepted = listener.accept() => { + match accepted { + Ok(conn) => conn, + Err(e) => { + error!(error = %e, "Failed to accept connection"); + continue; + } + } } }; @@ -282,11 +308,53 @@ pub async fn run_server( }); } } + + state + .compute + .cleanup_on_shutdown() + .await + .map_err(|err| Error::execution(format!("gateway shutdown cleanup failed: {err}")))?; + + Ok(()) +} + +async fn shutdown_signal() { + #[cfg(unix)] + { + tokio::select! { + _ = ctrl_c_signal() => {} + _ = terminate_signal() => {} + } + } + + #[cfg(not(unix))] + { + ctrl_c_signal().await; + } +} + +async fn ctrl_c_signal() { + if let Err(err) = tokio::signal::ctrl_c().await { + warn!(error = %err, "Failed to install Ctrl-C signal handler"); + std::future::pending::<()>().await; + } +} + +#[cfg(unix)] +async fn terminate_signal() { + let Ok(mut signal) = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) + else { + warn!("Failed to install SIGTERM signal handler"); + std::future::pending::<()>().await; + return; + }; + let _ = signal.recv().await; } async fn build_compute_runtime( config: &Config, vm_config: &VmComputeConfig, + docker_config: &DockerComputeConfig, store: Arc, sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, @@ -323,6 +391,17 @@ async fn build_compute_runtime( ) .await .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))), + ComputeDriverKind::Docker => ComputeRuntime::new_docker( + config.clone(), + docker_config.clone(), + store, + sandbox_index, + sandbox_watch_bus, + tracing_log_bus, + supervisor_sessions, + ) + .await + .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))), ComputeDriverKind::Vm => { let (channel, driver_process) = compute::vm::spawn(config, vm_config).await?; ComputeRuntime::new_remote_vm( @@ -392,11 +471,10 @@ fn configured_compute_driver(config: &Config) -> Result { [] => Err(Error::config( "at least one compute driver must be configured", )), - [ - driver @ (ComputeDriverKind::Kubernetes - | ComputeDriverKind::Vm - | ComputeDriverKind::Podman), - ] => Ok(*driver), + [driver @ ComputeDriverKind::Kubernetes] + | [driver @ ComputeDriverKind::Vm] + | [driver @ ComputeDriverKind::Docker] + | [driver @ ComputeDriverKind::Podman] => Ok(*driver), drivers => Err(Error::config(format!( "multiple compute drivers are not supported yet; configured drivers: {}", drivers @@ -470,4 +548,13 @@ mod tests { ComputeDriverKind::Vm ); } + + #[test] + fn configured_compute_driver_accepts_docker() { + let config = Config::new(None).with_compute_drivers([ComputeDriverKind::Docker]); + assert_eq!( + configured_compute_driver(&config).unwrap(), + ComputeDriverKind::Docker + ); + } } diff --git a/crates/openshell-server/src/supervisor_session.rs b/crates/openshell-server/src/supervisor_session.rs index e712b6b5d..cd250459a 100644 --- a/crates/openshell-server/src/supervisor_session.rs +++ b/crates/openshell-server/src/supervisor_session.rs @@ -61,6 +61,12 @@ struct LiveSession { /// Holds a oneshot sender that will deliver the upgraded relay stream. type RelayStreamSender = oneshot::Sender; +impl openshell_driver_docker::SupervisorReadiness for SupervisorSessionRegistry { + fn is_supervisor_connected(&self, sandbox_id: &str) -> bool { + Self::is_connected(self, sandbox_id) + } +} + /// Registry of active supervisor sessions and pending relay channels. #[derive(Default)] pub struct SupervisorSessionRegistry { @@ -126,6 +132,14 @@ impl SupervisorSessionRegistry { } } + /// Report whether a live supervisor session is registered for a sandbox. + /// + /// Used by compute drivers that need to surface "supervisor relay ready" + /// through the Ready condition without polling the sandbox runtime. + pub fn is_connected(&self, sandbox_id: &str) -> bool { + self.sessions.lock().unwrap().contains_key(sandbox_id) + } + /// Remove the session for a sandbox. fn remove(&self, sandbox_id: &str) { self.sessions.lock().unwrap().remove(sandbox_id); diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index 1871219aa..300dd1b46 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -54,6 +54,7 @@ COPY Cargo.toml Cargo.lock ./ COPY crates/openshell-bootstrap/Cargo.toml crates/openshell-bootstrap/Cargo.toml COPY crates/openshell-cli/Cargo.toml crates/openshell-cli/Cargo.toml COPY crates/openshell-core/Cargo.toml crates/openshell-core/Cargo.toml +COPY crates/openshell-driver-docker/Cargo.toml crates/openshell-driver-docker/Cargo.toml COPY crates/openshell-driver-kubernetes/Cargo.toml crates/openshell-driver-kubernetes/Cargo.toml COPY crates/openshell-driver-podman/Cargo.toml crates/openshell-driver-podman/Cargo.toml COPY crates/openshell-ocsf/Cargo.toml crates/openshell-ocsf/Cargo.toml @@ -72,6 +73,7 @@ RUN mkdir -p \ crates/openshell-bootstrap/src \ crates/openshell-cli/src \ crates/openshell-core/src \ + crates/openshell-driver-docker/src \ crates/openshell-driver-kubernetes/src \ crates/openshell-driver-podman/src \ crates/openshell-ocsf/src \ @@ -86,6 +88,7 @@ RUN mkdir -p \ touch crates/openshell-bootstrap/src/lib.rs && \ printf 'fn main() {}\n' > crates/openshell-cli/src/main.rs && \ touch crates/openshell-core/src/lib.rs && \ + touch crates/openshell-driver-docker/src/lib.rs && \ touch crates/openshell-driver-kubernetes/src/lib.rs && \ printf 'fn main() {}\n' > crates/openshell-driver-kubernetes/src/main.rs && \ touch crates/openshell-driver-podman/src/lib.rs && \ @@ -124,6 +127,7 @@ FROM rust-deps AS gateway-workspace ARG OPENSHELL_CARGO_VERSION COPY crates/openshell-core/ crates/openshell-core/ +COPY crates/openshell-driver-docker/ crates/openshell-driver-docker/ COPY crates/openshell-driver-kubernetes/ crates/openshell-driver-kubernetes/ COPY crates/openshell-driver-podman/ crates/openshell-driver-podman/ COPY crates/openshell-ocsf/ crates/openshell-ocsf/ diff --git a/e2e/rust/e2e-docker.sh b/e2e/rust/e2e-docker.sh new file mode 100755 index 000000000..1c32ed4d1 --- /dev/null +++ b/e2e/rust/e2e-docker.sh @@ -0,0 +1,320 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Run the Rust e2e smoke test against a standalone gateway running the +# bundled Docker compute driver. +# +# Unlike the Kubernetes driver (which deploys a k3s cluster) or the VM +# driver (which boots libkrun), the Docker driver runs in-process inside +# the gateway binary and uses the local Docker daemon to run sandbox +# containers. This script: +# +# 1. Builds openshell-gateway, openshell-cli, and a Linux ELF +# openshell-sandbox binary (via the Docker image build pipeline on +# non-Linux hosts so macOS linker fd limits are avoided). +# 2. Ensures the sandbox base image exists locally; containers launch +# from it with the freshly built openshell-sandbox binary bind-mounted +# over the image-provided copy. +# 3. Generates an ephemeral mTLS PKI (CA, server cert, client cert). +# 4. Starts openshell-gateway with --drivers=docker, binding to a +# random free host port. +# 5. Installs the client cert into the CLI gateway config dir and +# runs the Rust smoke test. +# 6. Tears the gateway process down on exit. +# +# Usage: mise run e2e:docker + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +WORKDIR="$(mktemp -d "/tmp/openshell-e2e-docker.XXXXXX")" +GATEWAY_BIN="${ROOT}/target/debug/openshell-gateway" +CLI_BIN="${ROOT}/target/debug/openshell" +STATE_DIR="" +GATEWAY_CONFIG_DIR="" +GATEWAY_PID="" +GATEWAY_LOG="${WORKDIR}/gateway.log" +# Unique sandbox namespace for this test run. Set just before the gateway +# is started so cleanup can filter Docker containers strictly to ones +# this run created, even when other OpenShell sandboxes are present on +# the host. Empty until assigned -- cleanup treats an empty namespace as +# "do nothing" so an early-exit trap never touches unrelated containers. +E2E_NAMESPACE="" + +cleanup() { + local exit_code=$? + if [ -n "${GATEWAY_PID}" ] && kill -0 "${GATEWAY_PID}" 2>/dev/null; then + echo "Stopping openshell-gateway (pid ${GATEWAY_PID})..." + kill "${GATEWAY_PID}" 2>/dev/null || true + wait "${GATEWAY_PID}" 2>/dev/null || true + fi + + # On failure, preserve sandbox container logs for post-mortem + # debugging before removing the containers. Filter strictly to + # containers this test run created (managed-by + this run's unique + # sandbox namespace) so we never touch unrelated OpenShell sandboxes + # the developer may be running in parallel. + if [ "${exit_code}" -ne 0 ] \ + && [ -n "${E2E_NAMESPACE}" ] \ + && command -v docker >/dev/null 2>&1; then + local ids + ids=$(docker ps -aq \ + --filter "label=openshell.ai/managed-by=openshell" \ + --filter "label=openshell.ai/sandbox-namespace=${E2E_NAMESPACE}" \ + 2>/dev/null || true) + if [ -n "${ids}" ]; then + echo "=== sandbox container logs (preserved for debugging) ===" + for id in ${ids}; do + echo "--- container ${id} (inspect) ---" + docker inspect --format '{{.Name}} state={{.State.Status}} exit={{.State.ExitCode}} restarts={{.RestartCount}} error={{.State.Error}}' "${id}" 2>/dev/null || true + echo "--- container ${id} (last 80 log lines) ---" + docker logs --tail 80 "${id}" 2>&1 || true + done + echo "=== end sandbox container logs ===" + fi + fi + + # Remove any lingering sandbox containers the gateway failed to clean + # up. Scope the filter to this run's namespace so we don't force-remove + # sandboxes belonging to other gateways or test runs on the same host. + if [ -n "${E2E_NAMESPACE}" ] && command -v docker >/dev/null 2>&1; then + local stale + stale=$(docker ps -aq \ + --filter "label=openshell.ai/managed-by=openshell" \ + --filter "label=openshell.ai/sandbox-namespace=${E2E_NAMESPACE}" \ + 2>/dev/null || true) + if [ -n "${stale}" ]; then + # shellcheck disable=SC2086 + docker rm -f ${stale} >/dev/null 2>&1 || true + fi + fi + + if [ "${exit_code}" -ne 0 ] && [ -f "${GATEWAY_LOG}" ]; then + echo "=== gateway log (preserved for debugging) ===" + cat "${GATEWAY_LOG}" + echo "=== end gateway log ===" + fi + + # Remove gateway CLI config we created so repeated runs don't + # accumulate stale gateway entries. + if [ -n "${GATEWAY_CONFIG_DIR}" ] && [ -d "${GATEWAY_CONFIG_DIR}" ]; then + rm -rf "${GATEWAY_CONFIG_DIR}" + fi + + rm -rf "${WORKDIR}" 2>/dev/null || true +} +trap cleanup EXIT + +# ── Preflight ──────────────────────────────────────────────────────── +if ! command -v docker >/dev/null 2>&1; then + echo "ERROR: docker CLI is required to run e2e:docker" >&2 + exit 2 +fi +if ! docker info >/dev/null 2>&1; then + echo "ERROR: docker daemon is not reachable (docker info failed)" >&2 + exit 2 +fi +if ! command -v openssl >/dev/null 2>&1; then + echo "ERROR: openssl is required to generate ephemeral PKI" >&2 + exit 2 +fi + +normalize_arch() { + case "$1" in + x86_64|amd64) echo "amd64" ;; + aarch64|arm64) echo "arm64" ;; + *) echo "$1" ;; + esac +} + +linux_target_triple() { + case "$1" in + amd64) echo "x86_64-unknown-linux-gnu" ;; + arm64) echo "aarch64-unknown-linux-gnu" ;; + *) + echo "ERROR: unsupported Docker daemon architecture '$1'" >&2 + exit 2 + ;; + esac +} + +# Detect Linux arch of the Docker daemon so we build the matching +# openshell-sandbox binary. +DAEMON_ARCH="$(normalize_arch "$(docker info --format '{{.Architecture}}' 2>/dev/null || true)")" +SUPERVISOR_TARGET="$(linux_target_triple "${DAEMON_ARCH}")" +HOST_OS="$(uname -s)" +HOST_ARCH="$(normalize_arch "$(uname -m)")" +SUPERVISOR_OUT_DIR="${WORKDIR}/supervisor/${DAEMON_ARCH}" +SUPERVISOR_BIN="${SUPERVISOR_OUT_DIR}/openshell-sandbox" + +# ── Build binaries ─────────────────────────────────────────────────── +# Cap build parallelism to avoid OOM when run alongside a docker build or +# on memory-constrained developer machines. Override with CARGO_BUILD_JOBS. +CARGO_BUILD_JOBS_ARG=() +if [ -n "${CARGO_BUILD_JOBS:-}" ]; then + CARGO_BUILD_JOBS_ARG=(-j "${CARGO_BUILD_JOBS}") +fi + +echo "Building openshell-gateway and openshell-cli..." +cargo build ${CARGO_BUILD_JOBS_ARG[@]+"${CARGO_BUILD_JOBS_ARG[@]}"} \ + -p openshell-server --bin openshell-gateway \ + -p openshell-cli --features openshell-core/dev-settings + +echo "Building openshell-sandbox for ${SUPERVISOR_TARGET}..." +mkdir -p "${SUPERVISOR_OUT_DIR}" +if [ "${HOST_OS}" = "Linux" ] && [ "${HOST_ARCH}" = "${DAEMON_ARCH}" ]; then + rustup target add "${SUPERVISOR_TARGET}" >/dev/null 2>&1 || true + cargo build ${CARGO_BUILD_JOBS_ARG[@]+"${CARGO_BUILD_JOBS_ARG[@]}"} \ + --release -p openshell-sandbox --target "${SUPERVISOR_TARGET}" + cp "${ROOT}/target/${SUPERVISOR_TARGET}/release/openshell-sandbox" "${SUPERVISOR_BIN}" +else + CONTAINER_ENGINE=docker \ + DOCKER_PLATFORM="linux/${DAEMON_ARCH}" \ + DOCKER_OUTPUT="type=local,dest=${SUPERVISOR_OUT_DIR}" \ + bash "${ROOT}/tasks/scripts/docker-build-image.sh" supervisor-output +fi + +if [ ! -f "${SUPERVISOR_BIN}" ]; then + echo "ERROR: expected supervisor binary at ${SUPERVISOR_BIN}" >&2 + exit 1 +fi +chmod +x "${SUPERVISOR_BIN}" + +# ── Ensure a sandbox base image is available locally ──────────────── +# The bundled openshell-sandbox binary enforces a 'sandbox' user/group +# in the image. Use the community sandbox base image (also what real +# deployments default to). Callers can override with +# OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE if they have a smaller local image +# with the required 'sandbox' user. +SANDBOX_IMAGE="${OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE:-ghcr.io/nvidia/openshell-community/sandboxes/base:latest}" +if ! docker image inspect "${SANDBOX_IMAGE}" >/dev/null 2>&1; then + echo "Pulling ${SANDBOX_IMAGE}..." + docker pull "${SANDBOX_IMAGE}" +fi + +# ── Generate ephemeral mTLS PKI ────────────────────────────────────── +PKI_DIR="${WORKDIR}/pki" +mkdir -p "${PKI_DIR}" +cd "${PKI_DIR}" + +cat > openssl.cnf <<'EOF' +[req] +distinguished_name = dn +prompt = no +[dn] +CN = openshell-server +[san_server] +subjectAltName = @alt_server +[alt_server] +DNS.1 = localhost +DNS.2 = host.openshell.internal +DNS.3 = host.docker.internal +IP.1 = 127.0.0.1 +IP.2 = ::1 +[san_client] +subjectAltName = DNS:openshell-client +EOF + +openssl req -x509 -newkey rsa:2048 -nodes -days 30 \ + -keyout ca.key -out ca.crt -subj "/CN=openshell-e2e-ca" >/dev/null 2>&1 + +openssl req -newkey rsa:2048 -nodes -keyout server.key -out server.csr \ + -config openssl.cnf >/dev/null 2>&1 +openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial \ + -out server.crt -days 30 -extfile openssl.cnf -extensions san_server >/dev/null 2>&1 + +openssl req -newkey rsa:2048 -nodes -keyout client.key -out client.csr \ + -subj "/CN=openshell-client" >/dev/null 2>&1 +openssl x509 -req -in client.csr -CA ca.crt -CAkey ca.key -CAcreateserial \ + -out client.crt -days 30 -extfile openssl.cnf -extensions san_client >/dev/null 2>&1 + +cd "${ROOT}" + +# ── Pick a free port ───────────────────────────────────────────────── +pick_port() { + python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()' +} +HOST_PORT=$(pick_port) + +STATE_DIR="${WORKDIR}/state" +mkdir -p "${STATE_DIR}" + +# Containers started by the docker driver reach the host gateway via +# host.openshell.internal (mapped to host-gateway by the driver). The +# gateway itself binds to 0.0.0.0:${HOST_PORT}. +GATEWAY_ENDPOINT="https://host.openshell.internal:${HOST_PORT}" + +# Unique per-run sandbox namespace. The Docker driver stamps every +# container with `openshell.ai/sandbox-namespace=`, so cleanup can +# filter on this value and never touch sandboxes belonging to other +# gateways or test runs on the same host. +E2E_NAMESPACE="e2e-docker-$$-${HOST_PORT}" + +echo "Starting openshell-gateway on port ${HOST_PORT} (namespace: ${E2E_NAMESPACE})..." +"${GATEWAY_BIN}" \ + --port "${HOST_PORT}" \ + --drivers docker \ + --sandbox-namespace "${E2E_NAMESPACE}" \ + --tls-cert "${PKI_DIR}/server.crt" \ + --tls-key "${PKI_DIR}/server.key" \ + --tls-client-ca "${PKI_DIR}/ca.crt" \ + --db-url "sqlite:${STATE_DIR}/gateway.db?mode=rwc" \ + --grpc-endpoint "${GATEWAY_ENDPOINT}" \ + --docker-supervisor-bin "${SUPERVISOR_BIN}" \ + --docker-tls-ca "${PKI_DIR}/ca.crt" \ + --docker-tls-cert "${PKI_DIR}/client.crt" \ + --docker-tls-key "${PKI_DIR}/client.key" \ + --sandbox-image "${SANDBOX_IMAGE}" \ + --sandbox-image-pull-policy IfNotPresent \ + >"${GATEWAY_LOG}" 2>&1 & +GATEWAY_PID=$! + +# ── Register the gateway with the CLI ──────────────────────────────── +# Writes both metadata.json (for `--gateway ` lookup) and the mTLS +# bundle the CLI uses to authenticate to the gateway. +GATEWAY_NAME="openshell-e2e-docker-${HOST_PORT}" +CLI_GATEWAY_ENDPOINT="https://127.0.0.1:${HOST_PORT}" +GATEWAY_CONFIG_DIR="${HOME}/.config/openshell/gateways/${GATEWAY_NAME}" +mkdir -p "${GATEWAY_CONFIG_DIR}/mtls" +cp "${PKI_DIR}/ca.crt" "${GATEWAY_CONFIG_DIR}/mtls/ca.crt" +cp "${PKI_DIR}/client.crt" "${GATEWAY_CONFIG_DIR}/mtls/tls.crt" +cp "${PKI_DIR}/client.key" "${GATEWAY_CONFIG_DIR}/mtls/tls.key" +cat >"${GATEWAY_CONFIG_DIR}/metadata.json" </dev/null; then + echo "ERROR: openshell-gateway exited before becoming healthy" + exit 1 + fi + if "${CLI_BIN}" status >/dev/null 2>&1; then + echo "Gateway healthy after ${elapsed}s." + break + fi + sleep 2 + elapsed=$((elapsed + 2)) +done +if [ "${elapsed}" -ge "${timeout}" ]; then + echo "ERROR: gateway did not become healthy within ${timeout}s" + exit 1 +fi + +# ── Run the smoke test ─────────────────────────────────────────────── +echo "Running e2e smoke test (gateway: ${OPENSHELL_GATEWAY}, endpoint: ${CLI_GATEWAY_ENDPOINT})..." +cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test smoke -- --nocapture + +echo "Smoke test passed." diff --git a/tasks/gateway.toml b/tasks/gateway.toml new file mode 100644 index 000000000..3f7a684d2 --- /dev/null +++ b/tasks/gateway.toml @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Standalone gateway tasks + +["gateway:docker"] +description = "Run a standalone gateway with the bundled Docker compute driver" +run = "bash tasks/scripts/gateway-docker.sh" diff --git a/tasks/scripts/gateway-docker.sh b/tasks/scripts/gateway-docker.sh new file mode 100644 index 000000000..23527741f --- /dev/null +++ b/tasks/scripts/gateway-docker.sh @@ -0,0 +1,186 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Start a standalone openshell-gateway backed by the Docker compute driver for +# local manual testing. +# +# Defaults: +# - Plaintext HTTP on 127.0.0.1:18080 +# - Dedicated sandbox namespace "docker-dev" +# - Persistent state under .cache/gateway-docker +# +# Common overrides: +# OPENSHELL_SERVER_PORT=19080 mise run gateway:docker +# OPENSHELL_DOCKER_GATEWAY_NAME=my-docker-gateway mise run gateway:docker +# OPENSHELL_SANDBOX_NAMESPACE=my-ns mise run gateway:docker +# OPENSHELL_SANDBOX_IMAGE=ghcr.io/... mise run gateway:docker +# +# After the gateway is running, point the CLI at it with either: +# openshell --gateway docker-dev +# openshell gateway use docker-dev # then plain `openshell ` + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +PORT="${OPENSHELL_SERVER_PORT:-18080}" +GATEWAY_NAME="${OPENSHELL_DOCKER_GATEWAY_NAME:-docker-dev}" +STATE_DIR="${OPENSHELL_DOCKER_GATEWAY_STATE_DIR:-${ROOT}/.cache/gateway-docker}" +SANDBOX_NAMESPACE="${OPENSHELL_SANDBOX_NAMESPACE:-docker-dev}" +SANDBOX_IMAGE="${OPENSHELL_SANDBOX_IMAGE:-ghcr.io/nvidia/openshell-community/sandboxes/base:latest}" +SANDBOX_IMAGE_PULL_POLICY="${OPENSHELL_SANDBOX_IMAGE_PULL_POLICY:-IfNotPresent}" +LOG_LEVEL="${OPENSHELL_LOG_LEVEL:-info}" +GATEWAY_BIN="${ROOT}/target/debug/openshell-gateway" + +normalize_arch() { + case "$1" in + x86_64|amd64) echo "amd64" ;; + aarch64|arm64) echo "arm64" ;; + *) echo "$1" ;; + esac +} + +linux_target_triple() { + case "$1" in + amd64) echo "x86_64-unknown-linux-gnu" ;; + arm64) echo "aarch64-unknown-linux-gnu" ;; + *) + echo "ERROR: unsupported Docker daemon architecture '$1'" >&2 + exit 2 + ;; + esac +} + +port_is_in_use() { + local port=$1 + if command -v lsof >/dev/null 2>&1; then + lsof -nP -iTCP:"${port}" -sTCP:LISTEN >/dev/null 2>&1 + return $? + fi + if command -v nc >/dev/null 2>&1; then + nc -z 127.0.0.1 "${port}" >/dev/null 2>&1 + return $? + fi + (echo >/dev/tcp/127.0.0.1/"${port}") >/dev/null 2>&1 +} + +register_gateway_metadata() { + local name=$1 + local endpoint=$2 + local port=$3 + local config_home gateway_dir + + config_home="${XDG_CONFIG_HOME:-${HOME}/.config}" + gateway_dir="${config_home}/openshell/gateways/${name}" + + mkdir -p "${gateway_dir}" + cat >"${gateway_dir}/metadata.json" <&2 + exit 2 +fi + +if ! command -v docker >/dev/null 2>&1; then + echo "ERROR: docker CLI is required" >&2 + exit 2 +fi +if ! docker info >/dev/null 2>&1; then + echo "ERROR: docker daemon is not reachable" >&2 + exit 2 +fi + +if port_is_in_use "${PORT}"; then + echo "ERROR: port ${PORT} is already in use; free it or set OPENSHELL_SERVER_PORT" >&2 + exit 2 +fi + +GRPC_ENDPOINT="${OPENSHELL_GRPC_ENDPOINT:-http://host.openshell.internal:${PORT}}" + +DAEMON_ARCH="$(normalize_arch "$(docker info --format '{{.Architecture}}' 2>/dev/null || true)")" +HOST_OS="$(uname -s)" +HOST_ARCH="$(normalize_arch "$(uname -m)")" +SUPERVISOR_TARGET="$(linux_target_triple "${DAEMON_ARCH}")" +# Cache the supervisor binary alongside the gateway state. Reuses the same +# Docker pipeline that builds the cluster supervisor image, so the cross- +# compile happens inside Linux containers — sidestepping macOS's per-process +# file-descriptor cap that breaks zig/ld for this many rlibs. +SUPERVISOR_OUT_DIR="${STATE_DIR}/supervisor/${DAEMON_ARCH}" +SUPERVISOR_BIN="${SUPERVISOR_OUT_DIR}/openshell-sandbox" + +CARGO_BUILD_JOBS_ARG=() +if [[ -n "${CARGO_BUILD_JOBS:-}" ]]; then + CARGO_BUILD_JOBS_ARG=(-j "${CARGO_BUILD_JOBS}") +fi + +echo "Building openshell-gateway..." +cargo build ${CARGO_BUILD_JOBS_ARG[@]+"${CARGO_BUILD_JOBS_ARG[@]}"} \ + -p openshell-server --bin openshell-gateway + +echo "Building openshell-sandbox for ${SUPERVISOR_TARGET}..." +if [[ "${HOST_OS}" == "Linux" && "${HOST_ARCH}" == "${DAEMON_ARCH}" ]]; then + # Native Linux build — no cross-toolchain required. + rustup target add "${SUPERVISOR_TARGET}" >/dev/null 2>&1 || true + cargo build ${CARGO_BUILD_JOBS_ARG[@]+"${CARGO_BUILD_JOBS_ARG[@]}"} \ + -p openshell-sandbox --target "${SUPERVISOR_TARGET}" + mkdir -p "${SUPERVISOR_OUT_DIR}" + cp "${ROOT}/target/${SUPERVISOR_TARGET}/debug/openshell-sandbox" "${SUPERVISOR_BIN}" +else + # Cross-compile via the existing Docker pipeline. The supervisor-output + # stage in deploy/docker/Dockerfile.images extracts just the openshell- + # sandbox binary, with the actual link happening inside Linux containers + # where FD limits are not a problem. + # + # This task is gated on a working Docker daemon above, so pin the + # container-engine helper to docker — otherwise it auto-detects podman + # whenever the binary happens to be on PATH. + mkdir -p "${SUPERVISOR_OUT_DIR}" + CONTAINER_ENGINE=docker \ + DOCKER_PLATFORM="linux/${DAEMON_ARCH}" \ + DOCKER_OUTPUT="type=local,dest=${SUPERVISOR_OUT_DIR}" \ + bash "${ROOT}/tasks/scripts/docker-build-image.sh" supervisor-output +fi + +if [[ ! -f "${SUPERVISOR_BIN}" ]]; then + echo "ERROR: expected supervisor binary at ${SUPERVISOR_BIN}" >&2 + exit 1 +fi +chmod +x "${SUPERVISOR_BIN}" + +mkdir -p "${STATE_DIR}" + +GATEWAY_ENDPOINT="http://127.0.0.1:${PORT}" +register_gateway_metadata "${GATEWAY_NAME}" "${GATEWAY_ENDPOINT}" "${PORT}" + +echo "Starting standalone Docker gateway..." +echo " gateway: ${GATEWAY_NAME}" +echo " endpoint: ${GATEWAY_ENDPOINT}" +echo " namespace: ${SANDBOX_NAMESPACE}" +echo " state dir: ${STATE_DIR}" +echo +echo "Point the CLI at this gateway with one of:" +echo " openshell --gateway ${GATEWAY_NAME} status" +echo " openshell gateway select ${GATEWAY_NAME}" +echo + +exec "${GATEWAY_BIN}" \ + --port "${PORT}" \ + --log-level "${LOG_LEVEL}" \ + --drivers docker \ + --disable-tls \ + --db-url "sqlite:${STATE_DIR}/gateway.db?mode=rwc" \ + --sandbox-namespace "${SANDBOX_NAMESPACE}" \ + --sandbox-image "${SANDBOX_IMAGE}" \ + --sandbox-image-pull-policy "${SANDBOX_IMAGE_PULL_POLICY}" \ + --grpc-endpoint "${GRPC_ENDPOINT}" \ + --docker-supervisor-bin "${SUPERVISOR_BIN}" diff --git a/tasks/test.toml b/tasks/test.toml index 177bf5a58..5a7f306bc 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -56,3 +56,7 @@ run = "e2e/rust/e2e-podman.sh" ["e2e:vm"] description = "Start openshell-gateway with the VM compute driver and run the cluster-agnostic smoke e2e" run = "e2e/rust/e2e-vm.sh" + +["e2e:docker"] +description = "Run smoke e2e against a standalone gateway with the Docker compute driver" +run = "e2e/rust/e2e-docker.sh"